limpiar scraper

This commit is contained in:
Cat /dev/Nulo 2023-12-24 18:06:52 -03:00
parent f2f5c7afdd
commit 794247d657
6 changed files with 8 additions and 75 deletions

View file

@@ -1,40 +0,0 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";
// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
// getCarrefourProduct(html);
// });
// await bench("warcio", async () => {
// const warc = Bun.spawn(
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
// {
// // stdin: Bun.file().stream(),
// }
// ).stdout;
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
// const parser = new WARCParser(warc);
// for await (const record of parser) {
// const html = await record.contentText();
// }
// });
// await bench("warc", );
/**
 * Runs `func` once and logs its wall-clock duration via console.debug.
 *
 * @param name - label printed alongside the measurement
 * @param func - async operation to time
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const elapsed = performance.now() - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
// await run({});

View file

@@ -1,27 +0,0 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
// CLI entry point: argv[2] is a file with one URL per line, argv[3] is an
// output directory. Each URL is fetched and saved under a random id, with a
// sibling `<id>.link` file recording which URL the HTML came from.
(async () => {
  const [, , inputPath, outputPath] = process.argv;
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const urls = (await readFile(inputPath, "utf-8")).split("\n");
  // Download one URL and persist both the HTML and its source link.
  const scrapeOne = async (url: string) => {
    const id = nanoid();
    const html = await getHtml(url);
    await writeFile(join(outputPath, `${id}.link`), url);
    await writeFile(join(outputPath, id), html);
  };
  // Cap parallel fetches at 12 to avoid hammering the target hosts.
  await pMap(urls, scrapeOne, { concurrency: 12 });
})();

View file

@@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js";
import { Precioish, type Precio } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js";
function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector(

View file

@@ -1,5 +1,5 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { type Precioish } from "../scrap.js";
function getEanFromText({ document }: Window) {
const potentialEanEls = Array.from(

View file

@@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);

View file

@@ -6,9 +6,9 @@ import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./dia.js";
import { getCotoProduct } from "./coto.js";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm";