Compare commits

..

No commits in common. "405502877c288e24e45c176b86e60b98d4ee67ca" and "eb0e62447b67f47e08324c1a041f9bfae244702f" have entirely different histories.

4 changed files with 39 additions and 38 deletions

View file

@ -7,19 +7,16 @@ import { parseWarc } from "./scrap.js";
if (process.argv[2] === "auto") {
await auto();
} else if (process.argv[2] === "scrap-carrefour-links") {
await scrapCarrefourProducts();
await scrapCarrefourProducts()
} else if (process.argv[2] === "scrap-dia-links") {
await scrapDiaProducts();
await scrapDiaProducts()
} else if (process.argv[2] === "scrap-coto-links") {
await scrapCotoProducts();
await scrapCotoProducts()
} else if (process.argv[2] === "scrap") {
const warcPaths = process.argv.slice(3);
if (warcPaths.length > 0) {
for (const path of warcPaths) {
const res = await parseWarc(path);
console.info("=======================================");
console.info(path, res);
console.info("=======================================");
await parseWarc(path);
}
} else {
console.error("Especificá WARCs para scrapear.");

View file

@ -12,10 +12,6 @@ export function priceFromMeta(dom: Window) {
const precioCentavos = parseFloat(precioMeta) * 100;
return precioCentavos;
}
/**
 * Tells whether the page advertises the product as being in stock.
 *
 * Looks up the `product:availability` meta property through `getMetaProp`
 * and compares it against the literal `"instock"`.
 *
 * @param dom - Parsed document to inspect.
 * @returns `true` only when the meta value is exactly `"instock"`.
 */
export function stockFromMeta(dom: Window) {
  return getMetaProp(dom, "product:availability") === "instock";
}
function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll(

View file

@ -1,7 +1,20 @@
import { parseHTML } from "linkedom";
import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
import { Precioish, type Precio } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js";
/**
 * Reads the EAN from the product specification table.
 *
 * Finds the cell `td[data-specification="EAN"]`, then takes the second child
 * of that cell's parent element and returns its `data-specification` value.
 *
 * @param dom - Parsed document to search.
 * @returns The non-empty `data-specification` value of the second child.
 * @throws Error when the label cell, its parent or the second child is
 *   missing, when the second child is not an `HTMLElement`, or when its
 *   `data-specification` value is absent or empty.
 */
function getEanByTable(dom: Window): string {
  const labelCell = dom.window.document.querySelector(
    'td[data-specification="EAN"]'
  );
  const valueCell = labelCell?.parentElement?.children[1];
  if (valueCell && valueCell instanceof dom.window.HTMLElement) {
    const ean = valueCell.dataset.specification;
    if (ean) return ean;
  }
  throw new Error("No encontré el EAN");
}
function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>(
`template[data-type="json"][data-varname="${varname}"]`
@ -24,27 +37,31 @@ function eanFromSeedState(dom: Window): string {
if (!productSkuJson) throw new Error("no encontré el sku en el json");
return productSkuJson[1].ean;
}
/**
 * Extracts the product EAN from the Dynamic Yield loader script.
 *
 * Finds the `<script>` whose `src` starts with `//st.dynamicyield.com/st?`,
 * reads its `ctx` query parameter, parses it as JSON and returns the first
 * entry of its `data` array.
 *
 * @param dom - Parsed document to search.
 * @returns The first `data` entry as a string (numbers are stringified).
 * @throws Error when the script is missing, when it has no `ctx` parameter,
 *   or when `ctx` has no usable first `data` entry.
 * @throws SyntaxError when `ctx` is not valid JSON.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const scriptEl = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");

  // The selector matches a protocol-relative `src`. If the DOM implementation
  // hands back the raw attribute instead of a resolved URL, `new URL(src)`
  // without a base throws, so always supply one. An absolute `src` ignores it.
  const url = new URL(scriptEl.src, "https://st.dynamicyield.com");
  const ctx = url.searchParams.get("ctx");
  if (!ctx) throw new Error("no hay ctx");

  // JSON.parse yields untyped data: check the shape before trusting it.
  const parsed: unknown = JSON.parse(ctx);
  const data = (parsed as { data?: unknown } | null)?.data;
  const ean: unknown = Array.isArray(data) ? data[0] : undefined;
  if (typeof ean === "number") return String(ean);
  if (typeof ean !== "string" || ean === "")
    throw new Error("no hay ean en el ctx de dynamicyield");
  return ean;
}
export function getCarrefourProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
const precioCentavos = priceFromMeta(dom);
const inStock = stockFromMeta(dom);
const ean = eanFromSeedState(dom);
let name, imageUrl;
try {
const ld = getProductJsonLd(dom);
name = ld.name;
imageUrl = ld.image;
} catch (error) {
if (inStock) {
throw error;
} else {
// algunas paginas sin stock no tienen json ld
}
}
const name = ld.name;
const imageUrl = ld.image;
const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock";
return {
name,

View file

@ -37,8 +37,9 @@ export async function parseWarc(path: string) {
errors: { error: any; warcRecordId: string; path: string }[];
} = { done: 0, errors: [] };
const proc = Bun.spawn(["zstdcat", "-d", path], {});
const warc = proc.stdout;
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
stderr: "ignore",
}).stdout;
// TODO: tirar error si falla zstd
const parser = new WARCParser(warc);
@ -52,12 +53,6 @@ export async function parseWarc(path: string) {
console.debug(`skipped ${warcRecordId}`);
continue;
}
if (record.httpHeaders?.statusCode !== 200) {
console.debug(
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
);
continue;
}
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
const html = await record.contentText();
@ -104,9 +99,5 @@ export async function parseWarc(path: string) {
}
}
if ((await proc.exited) !== 0) {
throw new Error("zstd tiró un error");
}
return progress;
}