Compare commits

...

6 commits

Author SHA1 Message Date
405502877c ignorar paginas status!=200 2024-01-01 16:50:17 -03:00
7b989f0ea0 fixup! carrefour 2024-01-01 16:50:00 -03:00
f85c74a7bc borrar parsers no usados 2024-01-01 16:49:52 -03:00
518a7cb2f5 carrefour: solo intentar parsear jsonld en error 2024-01-01 16:49:43 -03:00
eb271fb496 scrap manual: printear resultado 2024-01-01 16:03:22 -03:00
8482d7ceea scrap: trackear proceso zstd 2024-01-01 15:32:42 -03:00
4 changed files with 38 additions and 39 deletions

View file

@ -7,16 +7,19 @@ import { parseWarc } from "./scrap.js";
if (process.argv[2] === "auto") { if (process.argv[2] === "auto") {
await auto(); await auto();
} else if (process.argv[2] === "scrap-carrefour-links") { } else if (process.argv[2] === "scrap-carrefour-links") {
await scrapCarrefourProducts() await scrapCarrefourProducts();
} else if (process.argv[2] === "scrap-dia-links") { } else if (process.argv[2] === "scrap-dia-links") {
await scrapDiaProducts() await scrapDiaProducts();
} else if (process.argv[2] === "scrap-coto-links") { } else if (process.argv[2] === "scrap-coto-links") {
await scrapCotoProducts() await scrapCotoProducts();
} else if (process.argv[2] === "scrap") { } else if (process.argv[2] === "scrap") {
const warcPaths = process.argv.slice(3); const warcPaths = process.argv.slice(3);
if (warcPaths.length > 0) { if (warcPaths.length > 0) {
for (const path of warcPaths) { for (const path of warcPaths) {
await parseWarc(path); const res = await parseWarc(path);
console.info("=======================================");
console.info(path, res);
console.info("=======================================");
} }
} else { } else {
console.error("Especificá WARCs para scrapear."); console.error("Especificá WARCs para scrapear.");

View file

@ -12,6 +12,10 @@ export function priceFromMeta(dom: Window) {
const precioCentavos = parseFloat(precioMeta) * 100; const precioCentavos = parseFloat(precioMeta) * 100;
return precioCentavos; return precioCentavos;
} }
/**
 * Reports whether the product page is marked as in stock.
 *
 * Looks up the `product:availability` meta property through `getMetaProp`
 * and compares it against the literal `"instock"`.
 *
 * @param dom - Parsed document to inspect.
 * @returns `true` only when the property's value is exactly `"instock"`;
 *   any other value yields `false`.
 */
export function stockFromMeta(dom: Window) {
  return getMetaProp(dom, "product:availability") === "instock";
}
function parseJsonLds(dom: Window): object[] { function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll( const scripts = dom.window.document.querySelectorAll(

View file

@ -1,20 +1,7 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "../scrap.js"; import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js"; import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
/**
 * Extracts the product EAN from the specifications table.
 *
 * Locates the `td` whose `data-specification` attribute is `"EAN"`, takes
 * the second child of that cell's parent, and returns that element's own
 * `data-specification` value.
 *
 * @param dom - Parsed document to search.
 * @returns The non-empty `data-specification` value of the sibling cell.
 * @throws Error when the label cell, its parent or the second child is
 *   missing, when the second child is not an `HTMLElement`, or when its
 *   `data-specification` value is absent or empty.
 */
function getEanByTable(dom: Window): string {
  const labelCell = dom.window.document.querySelector(
    'td[data-specification="EAN"]'
  );
  // The value lives in the cell right after the label, i.e. index 1 of the row.
  const valueCell = labelCell?.parentElement?.children[1];
  if (valueCell && valueCell instanceof dom.window.HTMLElement) {
    const ean = valueCell.dataset.specification;
    if (ean) return ean;
  }
  throw new Error("No encontré el EAN");
}
function parseScriptJson<T>(dom: Window, varname: string): T { function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>( const script = dom.window.document.querySelector<HTMLTemplateElement>(
`template[data-type="json"][data-varname="${varname}"]` `template[data-type="json"][data-varname="${varname}"]`
@ -37,31 +24,27 @@ function eanFromSeedState(dom: Window): string {
if (!productSkuJson) throw new Error("no encontré el sku en el json"); if (!productSkuJson) throw new Error("no encontré el sku en el json");
return productSkuJson[1].ean; return productSkuJson[1].ean;
} }
/**
 * Reads the product EAN from the Dynamic Yield script tag's URL.
 *
 * Finds the `<script>` whose `src` starts with `//st.dynamicyield.com/st?`,
 * parses that `src` as a URL, JSON-decodes its `ctx` query parameter and
 * returns the first entry of the decoded object's `data` array.
 *
 * @param dom - Parsed document to search.
 * @returns `data[0]` of the JSON stored in the `ctx` query parameter.
 * @throws Error when no matching script element is found, when the match is
 *   not an `HTMLScriptElement`, or when the URL has no (or an empty) `ctx`
 *   parameter. `new URL` and `JSON.parse` may also throw on malformed input.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const candidate = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!candidate) throw new Error("no encuentro el script de dynamicyield");
  if (!(candidate instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");

  const rawContext = new URL(candidate.src).searchParams.get("ctx");
  if (!rawContext) throw new Error("no hay ctx");

  const context = JSON.parse(rawContext);
  return context.data[0];
}
export function getCarrefourProduct(html: string | Buffer): Precioish { export function getCarrefourProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html); const dom = parseHTML(html);
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
const inStock = stockFromMeta(dom);
const ean = eanFromSeedState(dom); const ean = eanFromSeedState(dom);
let name, imageUrl;
try {
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name; name = ld.name;
const imageUrl = ld.image; imageUrl = ld.image;
const inStock = } catch (error) {
ld.offers.offers[0].availability === "http://schema.org/InStock"; if (inStock) {
throw error;
} else {
// algunas paginas sin stock no tienen json ld
}
}
return { return {
name, name,

View file

@ -37,9 +37,8 @@ export async function parseWarc(path: string) {
errors: { error: any; warcRecordId: string; path: string }[]; errors: { error: any; warcRecordId: string; path: string }[];
} = { done: 0, errors: [] }; } = { done: 0, errors: [] };
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], { const proc = Bun.spawn(["zstdcat", "-d", path], {});
stderr: "ignore", const warc = proc.stdout;
}).stdout;
// TODO: tirar error si falla zstd // TODO: tirar error si falla zstd
const parser = new WARCParser(warc); const parser = new WARCParser(warc);
@ -53,6 +52,12 @@ export async function parseWarc(path: string) {
console.debug(`skipped ${warcRecordId}`); console.debug(`skipped ${warcRecordId}`);
continue; continue;
} }
if (record.httpHeaders?.statusCode !== 200) {
console.debug(
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
);
continue;
}
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo? // TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
const html = await record.contentText(); const html = await record.contentText();
@ -99,5 +104,9 @@ export async function parseWarc(path: string) {
} }
} }
if ((await proc.exited) !== 0) {
throw new Error("zstd tiró un error");
}
return progress; return progress;
} }