From 8f8e133b5f31c57dd1af0da4357637b64fb6d1d2 Mon Sep 17 00:00:00 2001 From: Nulo Date: Sat, 23 Dec 2023 01:04:48 -0300 Subject: [PATCH] mejoras scrap --- scraper/scrap.ts | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/scraper/scrap.ts b/scraper/scrap.ts index 2e6438a..d84c1ae 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -14,17 +14,19 @@ import { getCotoProduct } from "./coto.js"; import { join } from "path"; import pMap from "p-map"; +const DEBUG = false; + const sqlite = new Database("sqlite.db"); const db = drizzle(sqlite); sqlite.run(` -pragma journal_mode = WAL; -pragma synchronous = normal; -pragma temp_store = memory; -pragma mmap_size = 30000000000; +pragma journal_mode = OFF; +pragma synchronous = 0; +pragma cache_size = 1000000; +pragma locking_mode = exclusive; `); sqlite.run(` -create table precios( +create table if not exists precios( id integer primary key autoincrement, ean text not null, fetched_at text not null, @@ -34,12 +36,11 @@ create table precios( ); `); +let progress = { done: 0, errors: 0 }; await pMap(process.argv.slice(2), (path) => parseWarc(path), { concurrency: 40, }); -const DEBUG = false; - export type Precio = typeof precios.$inferInsert; export type Precioish = Omit; @@ -48,12 +49,16 @@ async function storePrecioPoint(point: Precio) { } async function parseWarc(path: string) { - const warc = createReadStream(path); + // const warc = createReadStream(path); + + const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], { + stderr: "ignore", + }).stdout; + const parser = new WARCParser(warc); - let progress = { done: 0, errors: 0 }; for await (const record of parser) { if (record.warcType === "response") { - if (!record.warcTargetURI) throw new Error("no uri"); + if (!record.warcTargetURI) continue; const html = await record.contentText(); const url = new URL(record.warcTargetURI);