Mirror of https://github.com/catdevnull/preciazo.git (synced 2024-11-26 11:36:20 +00:00)
limpiar scraper ("clean up scraper")
Commit 794247d657 (parent f2f5c7afdd)
6 changed files with 8 additions and 75 deletions

@@ -1,40 +0,0 @@
-// import { run, bench, group, baseline } from "mitata";
-import { createReadStream } from "node:fs";
-import { Writable } from "node:stream";
-import { pipeline } from "node:stream/promises";
-import { getCarrefourProduct } from "./carrefour.js";
-import { WARCParser } from "warcio";
-// import { ZSTDDecompress } from "simple-zstd";
-// import { AutoWARCParser } from "node-warc";
-
-// const html = await readFile("carrefour.html", "utf-8");
-// bench("carrefour", () => {
-//   getCarrefourProduct(html);
-// });
-
-// await bench("warcio", async () => {
-//   const warc = Bun.spawn(
-//     ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
-//     {
-//       // stdin: Bun.file().stream(),
-//     }
-//   ).stdout;
-//   // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
-//   // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
-
-//   const parser = new WARCParser(warc);
-//   for await (const record of parser) {
-//     const html = await record.contentText();
-//   }
-// });
-
-// await bench("warc", );
-
-async function bench(name: string, func: () => Promise<void>) {
-  const t0 = performance.now();
-  await func();
-  const t1 = performance.now();
-  console.debug(`${name} took ${t1 - t0}`);
-}
-
-// await run({});
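
The deleted file above was a scratch benchmark: mostly commented-out mitata and warcio/zstd experiments plus a small hand-rolled bench() timer. Its core idea boils down to timing getCarrefourProduct over a saved HTML snapshot. A minimal sketch of that, assuming a local carrefour.html fixture (the file name is an assumption, not something the repo ships) and the post-commit parsers/ layout:

// sketch: time the Carrefour parser over a saved HTML snapshot
// "carrefour.html" is an assumed local fixture
import { readFile } from "node:fs/promises";
import { getCarrefourProduct } from "./parsers/carrefour.js";

const html = await readFile("carrefour.html", "utf-8");
const t0 = performance.now();
getCarrefourProduct(html);
console.debug(`carrefour took ${performance.now() - t0}ms`);
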
@@ -1,27 +0,0 @@
-import { readFile, writeFile } from "fs/promises";
-import pMap from "p-map";
-import { nanoid } from "nanoid";
-import { getHtml } from "./fetch.js";
-import { join } from "path";
-
-(async () => {
-  const inputPath = process.argv[2];
-  const outputPath = process.argv[3];
-  if (!inputPath || !outputPath) {
-    console.error("falta input y/o output");
-    process.exit(1);
-  }
-  const file = await readFile(inputPath, "utf-8");
-  const urls = file.split("\n");
-
-  await pMap(
-    urls,
-    async (url: string) => {
-      const id = nanoid();
-      const html = await getHtml(url);
-      await writeFile(join(outputPath, `${id}.link`), url);
-      await writeFile(join(outputPath, id), html);
-    },
-    { concurrency: 12 }
-  );
-})();
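
The second deleted file was a one-off downloader: it read newline-separated URLs from argv[2], fetched each page with getHtml, and wrote the body plus a .link sidecar (holding the source URL) under a random nanoid into the output directory given as argv[3], with pMap capping concurrency at 12. A hedged sketch of that fan-out pattern, with fetchPage standing in for the repo's getHtml helper:

// sketch: bounded-concurrency download fan-out, as in the removed script
// fetchPage is a stand-in for the repo's getHtml; error handling is omitted
import { writeFile } from "node:fs/promises";
import { join } from "node:path";
import pMap from "p-map";
import { nanoid } from "nanoid";

async function fetchPage(url: string): Promise<string> {
  const res = await fetch(url);
  return await res.text();
}

export async function downloadAll(urls: string[], outDir: string) {
  await pMap(
    urls,
    async (url) => {
      const id = nanoid();
      await writeFile(join(outDir, `${id}.link`), url); // source URL sidecar
      await writeFile(join(outDir, id), await fetchPage(url)); // raw HTML
    },
    { concurrency: 12 },
  );
}
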
@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
-import { Precioish, type Precio } from "./scrap.js";
-import { getProductJsonLd, priceFromMeta } from "./common.js";
+import { Precioish, type Precio } from "../scrap.js";
+import { getProductJsonLd, priceFromMeta } from "../common.js";
 
 function getEanByTable(dom: Window): string {
   const eanLabelEl = dom.window.document.querySelector(

@@ -1,5 +1,5 @@
 import { parseHTML } from "linkedom";
-import { type Precioish } from "./scrap.js";
+import { type Precioish } from "../scrap.js";
 
 function getEanFromText({ document }: Window) {
   const potentialEanEls = Array.from(

@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
-import { type Precioish } from "./scrap.js";
-import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
+import { type Precioish } from "../scrap.js";
+import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
 
 export function getDiaProduct(html: string | Buffer): Precioish {
   const dom = parseHTML(html);
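
The three hunks above (the Carrefour, Coto and Dia parsers) are the same mechanical change: the parser modules now sit one directory deeper, so their relative imports of scrap.js and common.js gain a ../. Each parser follows the linkedom pattern visible here: parseHTML the raw HTML, then query the resulting DOM. A minimal sketch of that pattern (the selector and return value are placeholders, not the repo's actual extraction logic):

// sketch of the shared linkedom pattern; the selector is a placeholder,
// not preciazo's actual extraction logic
import { parseHTML } from "linkedom";

function getEanFromHtml(html: string): string | undefined {
  const dom = parseHTML(html);
  const el = dom.window.document.querySelector("td[data-specification='EAN']");
  return el?.textContent?.trim() || undefined;
}
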
@@ -6,9 +6,9 @@ import * as schema from "db-datos/schema.js";
 import { WARCParser } from "warcio";
 import { writeFile } from "fs/promises";
 import { createHash } from "crypto";
-import { getCarrefourProduct } from "./carrefour.js";
-import { getDiaProduct } from "./dia.js";
-import { getCotoProduct } from "./coto.js";
+import { getCarrefourProduct } from "./parsers/carrefour.js";
+import { getDiaProduct } from "./parsers/dia.js";
+import { getCotoProduct } from "./parsers/coto.js";
 import { join } from "path";
 import pMap from "p-map";
 import { and, eq, sql } from "drizzle-orm";
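
The last hunk is the other side of the move: scrap.ts now imports the three parsers from a ./parsers/ subdirectory. A hedged sketch of how a caller might dispatch to them by hostname; the mapping below is illustrative only, since this diff does not show scrap.ts's real selection logic:

// illustrative hostname dispatch to the relocated parsers; not the actual
// logic inside scrap.ts, which this diff does not show
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { type Precioish } from "./scrap.js";

function parseProduct(url: string, html: string): Precioish {
  const host = new URL(url).hostname;
  if (host.includes("carrefour")) return getCarrefourProduct(html);
  if (host.includes("dia")) return getDiaProduct(html);
  if (host.includes("coto")) return getCotoProduct(html);
  throw new Error(`no parser for ${host}`);
}
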