From 794247d657fe7618f7f6f0fce17915ff9ee84d40 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Sun, 24 Dec 2023 18:06:52 -0300
Subject: [PATCH] clean up scraper

---
 scraper/bench.ts                   | 40 ------------------------------
 scraper/downloadUrls.ts            | 27 --------------------
 scraper/{ => parsers}/carrefour.ts |  4 +--
 scraper/{ => parsers}/coto.ts      |  2 +-
 scraper/{ => parsers}/dia.ts       |  4 +--
 scraper/scrap.ts                   |  6 ++---
 6 files changed, 8 insertions(+), 75 deletions(-)
 delete mode 100644 scraper/bench.ts
 delete mode 100644 scraper/downloadUrls.ts
 rename scraper/{ => parsers}/carrefour.ts (95%)
 rename scraper/{ => parsers}/coto.ts (95%)
 rename scraper/{ => parsers}/dia.ts (79%)

diff --git a/scraper/bench.ts b/scraper/bench.ts
deleted file mode 100644
index 9f55f5b..0000000
--- a/scraper/bench.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-// import { run, bench, group, baseline } from "mitata";
-import { createReadStream } from "node:fs";
-import { Writable } from "node:stream";
-import { pipeline } from "node:stream/promises";
-import { getCarrefourProduct } from "./carrefour.js";
-import { WARCParser } from "warcio";
-// import { ZSTDDecompress } from "simple-zstd";
-// import { AutoWARCParser } from "node-warc";
-
-// const html = await readFile("carrefour.html", "utf-8");
-// bench("carrefour", () => {
-//   getCarrefourProduct(html);
-// });
-
-// await bench("warcio", async () => {
-//   const warc = Bun.spawn(
-//     ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
-//     {
-//       // stdin: Bun.file().stream(),
-//     }
-//   ).stdout;
-//   // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
-//   // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
-
-//   const parser = new WARCParser(warc);
-//   for await (const record of parser) {
-//     const html = await record.contentText();
-//   }
-// });
-
-// await bench("warc", );
-
-async function bench(name: string, func: () => Promise<void>) {
-  const t0 = performance.now();
-  await func();
-  const t1 = performance.now();
-  console.debug(`${name} took ${t1 - t0}`);
-}
-
-// await run({});
diff --git a/scraper/downloadUrls.ts b/scraper/downloadUrls.ts
deleted file mode 100644
index 87b694f..0000000
--- a/scraper/downloadUrls.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-import { readFile, writeFile } from "fs/promises";
-import pMap from "p-map";
-import { nanoid } from "nanoid";
-import { getHtml } from "./fetch.js";
-import { join } from "path";
-
-(async () => {
-  const inputPath = process.argv[2];
-  const outputPath = process.argv[3];
-  if (!inputPath || !outputPath) {
-    console.error("falta input y/o output");
-    process.exit(1);
-  }
-  const file = await readFile(inputPath, "utf-8");
-  const urls = file.split("\n");
-
-  await pMap(
-    urls,
-    async (url: string) => {
-      const id = nanoid();
-      const html = await getHtml(url);
-      await writeFile(join(outputPath, `${id}.link`), url);
-      await writeFile(join(outputPath, id), html);
-    },
-    { concurrency: 12 }
-  );
-})();
diff --git a/scraper/carrefour.ts b/scraper/parsers/carrefour.ts
similarity index 95%
rename from scraper/carrefour.ts
rename to scraper/parsers/carrefour.ts
index da79ce6..788c4a7 100644
--- a/scraper/carrefour.ts
+++ b/scraper/parsers/carrefour.ts
@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
-import { Precioish, type Precio } from "./scrap.js";
-import { getProductJsonLd, priceFromMeta } from "./common.js";
+import { Precioish, type Precio } from "../scrap.js";
+import { getProductJsonLd, priceFromMeta } from "../common.js";
 
 function getEanByTable(dom: Window): string {
   const eanLabelEl = dom.window.document.querySelector(
diff --git a/scraper/coto.ts b/scraper/parsers/coto.ts
similarity index 95%
rename from scraper/coto.ts
rename to scraper/parsers/coto.ts
index 2145634..5fda36f 100644
--- a/scraper/coto.ts
+++ b/scraper/parsers/coto.ts
@@ -1,5 +1,5 @@
 import { parseHTML } from "linkedom";
-import { type Precioish } from "./scrap.js";
+import { type Precioish } from "../scrap.js";
 
 function getEanFromText({ document }: Window) {
   const potentialEanEls = Array.from(
diff --git a/scraper/dia.ts b/scraper/parsers/dia.ts
similarity index 79%
rename from scraper/dia.ts
rename to scraper/parsers/dia.ts
index 7359f1c..0779da2 100644
--- a/scraper/dia.ts
+++ b/scraper/parsers/dia.ts
@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
-import { type Precioish } from "./scrap.js";
-import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
+import { type Precioish } from "../scrap.js";
+import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
 
 export function getDiaProduct(html: string | Buffer): Precioish {
   const dom = parseHTML(html);
diff --git a/scraper/scrap.ts b/scraper/scrap.ts
index 81698c8..34bed3e 100644
--- a/scraper/scrap.ts
+++ b/scraper/scrap.ts
@@ -6,9 +6,9 @@ import * as schema from "db-datos/schema.js";
 import { WARCParser } from "warcio";
 import { writeFile } from "fs/promises";
 import { createHash } from "crypto";
-import { getCarrefourProduct } from "./carrefour.js";
-import { getDiaProduct } from "./dia.js";
-import { getCotoProduct } from "./coto.js";
+import { getCarrefourProduct } from "./parsers/carrefour.js";
+import { getDiaProduct } from "./parsers/dia.js";
+import { getCotoProduct } from "./parsers/coto.js";
 import { join } from "path";
 import pMap from "p-map";
 import { and, eq, sql } from "drizzle-orm";