limpiar scraper

This commit is contained in:
Cat /dev/Nulo 2023-12-24 18:06:52 -03:00
parent f2f5c7afdd
commit 794247d657
6 changed files with 8 additions and 75 deletions

View file

@@ -1,40 +0,0 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";
// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
// getCarrefourProduct(html);
// });
// await bench("warcio", async () => {
// const warc = Bun.spawn(
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
// {
// // stdin: Bun.file().stream(),
// }
// ).stdout;
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
// const parser = new WARCParser(warc);
// for await (const record of parser) {
// const html = await record.contentText();
// }
// });
// await bench("warc", );
/**
 * Runs `func` once and logs its wall-clock duration via console.debug.
 *
 * @param name - label printed alongside the measurement
 * @param func - async operation to time
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const elapsed = performance.now() - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
// await run({});

View file

@@ -1,27 +0,0 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
// CLI entry point: argv[2] is a file with one URL per line, argv[3] is an
// output directory. Each URL is fetched and saved under a random id, with a
// sibling `<id>.link` file recording which URL the HTML came from.
(async () => {
  const [, , inputPath, outputPath] = process.argv;
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const urls = (await readFile(inputPath, "utf-8")).split("\n");
  // Download one URL and persist both the HTML and its source link.
  const scrapeOne = async (url: string) => {
    const id = nanoid();
    const html = await getHtml(url);
    await writeFile(join(outputPath, `${id}.link`), url);
    await writeFile(join(outputPath, id), html);
  };
  // Cap parallel fetches at 12 to avoid hammering the target hosts.
  await pMap(urls, scrapeOne, { concurrency: 12 });
})();

View file

@@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js";
import { Precioish, type Precio } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js";
function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector(

View file

@@ -1,5 +1,5 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { type Precioish } from "../scrap.js";
function getEanFromText({ document }: Window) {
const potentialEanEls = Array.from(

View file

@@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);

View file

@@ -6,9 +6,9 @@ import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./dia.js";
import { getCotoProduct } from "./coto.js";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm";