mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
limpiar scraper
This commit is contained in:
parent
f2f5c7afdd
commit
794247d657
6 changed files with 8 additions and 75 deletions
|
@ -1,40 +0,0 @@
|
||||||
// import { run, bench, group, baseline } from "mitata";
|
|
||||||
import { createReadStream } from "node:fs";
|
|
||||||
import { Writable } from "node:stream";
|
|
||||||
import { pipeline } from "node:stream/promises";
|
|
||||||
import { getCarrefourProduct } from "./carrefour.js";
|
|
||||||
import { WARCParser } from "warcio";
|
|
||||||
// import { ZSTDDecompress } from "simple-zstd";
|
|
||||||
// import { AutoWARCParser } from "node-warc";
|
|
||||||
|
|
||||||
// const html = await readFile("carrefour.html", "utf-8");
|
|
||||||
// bench("carrefour", () => {
|
|
||||||
// getCarrefourProduct(html);
|
|
||||||
// });
|
|
||||||
|
|
||||||
// await bench("warcio", async () => {
|
|
||||||
// const warc = Bun.spawn(
|
|
||||||
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
|
|
||||||
// {
|
|
||||||
// // stdin: Bun.file().stream(),
|
|
||||||
// }
|
|
||||||
// ).stdout;
|
|
||||||
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
|
|
||||||
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
|
|
||||||
|
|
||||||
// const parser = new WARCParser(warc);
|
|
||||||
// for await (const record of parser) {
|
|
||||||
// const html = await record.contentText();
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
|
|
||||||
// await bench("warc", );
|
|
||||||
|
|
||||||
/**
 * Runs `func` once, waits for it to settle, and reports how long it took.
 *
 * The elapsed time is the difference between two `performance.now()`
 * readings and is written with `console.debug` as `"<name> took <elapsed>"`.
 * If `func` rejects, the rejection propagates and nothing is logged.
 *
 * @param name - Label used in the log line.
 * @param func - Async work to time.
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const elapsed = performance.now() - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
|
|
||||||
|
|
||||||
// await run({});
|
|
|
@ -1,27 +0,0 @@
|
||||||
import { readFile, writeFile } from "fs/promises";
|
|
||||||
import pMap from "p-map";
|
|
||||||
import { nanoid } from "nanoid";
|
|
||||||
import { getHtml } from "./fetch.js";
|
|
||||||
import { join } from "path";
|
|
||||||
|
|
||||||
/**
 * Script entry point: downloads every URL listed in an input file.
 *
 * Usage: `<script> <inputPath> <outputPath>`
 * - `inputPath` (process.argv[2]): UTF-8 text file with one URL per line.
 * - `outputPath` (process.argv[3]): directory that receives the results.
 *
 * For each URL a fresh `nanoid()` id is generated and two files are written
 * into `outputPath`: `<id>.link` containing the URL, and `<id>` containing
 * whatever `getHtml(url)` resolved to. At most 12 URLs are processed at once.
 *
 * Exits with status 1 when either argument is missing.
 */
(async () => {
  const inputPath = process.argv[2];
  const outputPath = process.argv[3];
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const file = await readFile(inputPath, "utf-8");
  // A plain split("\n") yields an empty entry for the usual trailing newline
  // (and "\r"-suffixed entries for CRLF files). Trim each line and drop the
  // blank ones so no request is made for an empty URL.
  const urls = file
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  await pMap(
    urls,
    async (url: string) => {
      const id = nanoid();
      const html = await getHtml(url);
      // The ".link" sidecar records which URL the saved document came from.
      await writeFile(join(outputPath, `${id}.link`), url);
      await writeFile(join(outputPath, id), html);
    },
    { concurrency: 12 }
  );
})();
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish, type Precio } from "./scrap.js";
|
import { Precioish, type Precio } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta } from "./common.js";
|
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
||||||
|
|
||||||
function getEanByTable(dom: Window): string {
|
function getEanByTable(dom: Window): string {
|
||||||
const eanLabelEl = dom.window.document.querySelector(
|
const eanLabelEl = dom.window.document.querySelector(
|
|
@ -1,5 +1,5 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { type Precioish } from "./scrap.js";
|
import { type Precioish } from "../scrap.js";
|
||||||
|
|
||||||
function getEanFromText({ document }: Window) {
|
function getEanFromText({ document }: Window) {
|
||||||
const potentialEanEls = Array.from(
|
const potentialEanEls = Array.from(
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { type Precioish } from "./scrap.js";
|
import { type Precioish } from "../scrap.js";
|
||||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
||||||
|
|
||||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
|
@ -6,9 +6,9 @@ import * as schema from "db-datos/schema.js";
|
||||||
import { WARCParser } from "warcio";
|
import { WARCParser } from "warcio";
|
||||||
import { writeFile } from "fs/promises";
|
import { writeFile } from "fs/promises";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
import { getCarrefourProduct } from "./carrefour.js";
|
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||||
import { getDiaProduct } from "./dia.js";
|
import { getDiaProduct } from "./parsers/dia.js";
|
||||||
import { getCotoProduct } from "./coto.js";
|
import { getCotoProduct } from "./parsers/coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { and, eq, sql } from "drizzle-orm";
|
import { and, eq, sql } from "drizzle-orm";
|
||||||
|
|
Loading…
Reference in a new issue