mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
scrap: guardar a sqlite
This commit is contained in:
parent
b8276ce7fd
commit
7c1d6e190a
1 changed file with 21 additions and 14 deletions
|
@ -8,7 +8,6 @@ import { WARCParser } from "warcio";
|
||||||
import { createReadStream, createWriteStream } from "fs";
|
import { createReadStream, createWriteStream } from "fs";
|
||||||
import { writeFile } from "fs/promises";
|
import { writeFile } from "fs/promises";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
|
||||||
import { getCarrefourProduct } from "./carrefour.js";
|
import { getCarrefourProduct } from "./carrefour.js";
|
||||||
import { getDiaProduct } from "./dia.js";
|
import { getDiaProduct } from "./dia.js";
|
||||||
import { getCotoProduct } from "./coto.js";
|
import { getCotoProduct } from "./coto.js";
|
||||||
|
@ -17,7 +16,22 @@ import { join } from "path";
|
||||||
const sqlite = new Database("sqlite.db");
|
const sqlite = new Database("sqlite.db");
|
||||||
const db = drizzle(sqlite);
|
const db = drizzle(sqlite);
|
||||||
|
|
||||||
const DEBUG = true;
|
sqlite.run(`
|
||||||
|
create table precios(
|
||||||
|
id integer primary key autoincrement,
|
||||||
|
ean text not null,
|
||||||
|
fetched_at text not null,
|
||||||
|
precio_centavos integer,
|
||||||
|
in_stock integer,
|
||||||
|
url text
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
|
for (const path of process.argv.slice(2)) {
|
||||||
|
await parseWarc(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
const DEBUG = false;
|
||||||
|
|
||||||
export type Precio = typeof precios.$inferInsert;
|
export type Precio = typeof precios.$inferInsert;
|
||||||
export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">;
|
export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">;
|
||||||
|
@ -26,17 +40,13 @@ async function storePrecioPoint(point: Precio) {
|
||||||
await db.insert(precios).values(point);
|
await db.insert(precios).values(point);
|
||||||
}
|
}
|
||||||
|
|
||||||
(async () => {
|
async function parseWarc(path: string) {
|
||||||
const o = createWriteStream("x.tsv");
|
const warc = createReadStream(path);
|
||||||
o.write(`ean\tfetchedAt\tprecioCentavos\tinStock\turl\n`);
|
|
||||||
|
|
||||||
const warc = createReadStream(process.argv[2]);
|
|
||||||
const parser = new WARCParser(warc);
|
const parser = new WARCParser(warc);
|
||||||
let progress = { done: 0, errors: 0 };
|
let progress = { done: 0, errors: 0 };
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
if (record.warcType === "response") {
|
if (record.warcType === "response") {
|
||||||
if (!record.warcTargetURI) throw new Error("no uri");
|
if (!record.warcTargetURI) throw new Error("no uri");
|
||||||
console.log(record.warcTargetURI);
|
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
|
||||||
const url = new URL(record.warcTargetURI);
|
const url = new URL(record.warcTargetURI);
|
||||||
|
@ -56,10 +66,7 @@ async function storePrecioPoint(point: Precio) {
|
||||||
url: record.warcTargetURI,
|
url: record.warcTargetURI,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (ish)
|
if (ish) await storePrecioPoint(p);
|
||||||
o.write(
|
|
||||||
`${p.ean}\t${p.fetchedAt}\t${p.precioCentavos}\t${p.inStock}\t${p.url}\n`
|
|
||||||
);
|
|
||||||
|
|
||||||
// console.log(product);
|
// console.log(product);
|
||||||
progress.done++;
|
progress.done++;
|
||||||
|
@ -76,8 +83,8 @@ async function storePrecioPoint(point: Precio) {
|
||||||
console.error(`wrote html to ${output}`);
|
console.error(`wrote html to ${output}`);
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
console.debug(progress);
|
console.debug(`done: ${progress.done}; errored: ${progress.errors}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})();
|
}
|
||||||
|
|
Loading…
Reference in a new issue