mejoras scrap

This commit is contained in:
Cat /dev/Nulo 2023-12-23 01:04:48 -03:00
parent 95eac9dcca
commit 8f8e133b5f

View file

@@ -14,17 +14,19 @@ import { getCotoProduct } from "./coto.js";
import { join } from "path"; import { join } from "path";
import pMap from "p-map"; import pMap from "p-map";
const DEBUG = false;
const sqlite = new Database("sqlite.db"); const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite); const db = drizzle(sqlite);
sqlite.run(` sqlite.run(`
pragma journal_mode = WAL; pragma journal_mode = OFF;
pragma synchronous = normal; pragma synchronous = 0;
pragma temp_store = memory; pragma cache_size = 1000000;
pragma mmap_size = 30000000000; pragma locking_mode = exclusive;
`); `);
sqlite.run(` sqlite.run(`
create table precios( create table if not exists precios(
id integer primary key autoincrement, id integer primary key autoincrement,
ean text not null, ean text not null,
fetched_at text not null, fetched_at text not null,
@@ -34,12 +36,11 @@ create table precios(
); );
`); `);
let progress = { done: 0, errors: 0 };
await pMap(process.argv.slice(2), (path) => parseWarc(path), { await pMap(process.argv.slice(2), (path) => parseWarc(path), {
concurrency: 40, concurrency: 40,
}); });
const DEBUG = false;
export type Precio = typeof precios.$inferInsert; export type Precio = typeof precios.$inferInsert;
export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">; export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">;
@@ -48,12 +49,16 @@ async function storePrecioPoint(point: Precio) {
} }
async function parseWarc(path: string) { async function parseWarc(path: string) {
const warc = createReadStream(path); // const warc = createReadStream(path);
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
stderr: "ignore",
}).stdout;
const parser = new WARCParser(warc); const parser = new WARCParser(warc);
let progress = { done: 0, errors: 0 };
for await (const record of parser) { for await (const record of parser) {
if (record.warcType === "response") { if (record.warcType === "response") {
if (!record.warcTargetURI) throw new Error("no uri"); if (!record.warcTargetURI) continue;
const html = await record.contentText(); const html = await record.contentText();
const url = new URL(record.warcTargetURI); const url = new URL(record.warcTargetURI);