mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
Compare commits
6 commits
eb0e62447b
...
405502877c
Author | SHA1 | Date | |
---|---|---|---|
405502877c | |||
7b989f0ea0 | |||
f85c74a7bc | |||
518a7cb2f5 | |||
eb271fb496 | |||
8482d7ceea |
4 changed files with 38 additions and 39 deletions
|
@ -7,16 +7,19 @@ import { parseWarc } from "./scrap.js";
|
|||
if (process.argv[2] === "auto") {
|
||||
await auto();
|
||||
} else if (process.argv[2] === "scrap-carrefour-links") {
|
||||
await scrapCarrefourProducts()
|
||||
await scrapCarrefourProducts();
|
||||
} else if (process.argv[2] === "scrap-dia-links") {
|
||||
await scrapDiaProducts()
|
||||
await scrapDiaProducts();
|
||||
} else if (process.argv[2] === "scrap-coto-links") {
|
||||
await scrapCotoProducts()
|
||||
await scrapCotoProducts();
|
||||
} else if (process.argv[2] === "scrap") {
|
||||
const warcPaths = process.argv.slice(3);
|
||||
if (warcPaths.length > 0) {
|
||||
for (const path of warcPaths) {
|
||||
await parseWarc(path);
|
||||
const res = await parseWarc(path);
|
||||
console.info("=======================================");
|
||||
console.info(path, res);
|
||||
console.info("=======================================");
|
||||
}
|
||||
} else {
|
||||
console.error("Especificá WARCs para scrapear.");
|
||||
|
|
|
@ -12,6 +12,10 @@ export function priceFromMeta(dom: Window) {
|
|||
const precioCentavos = parseFloat(precioMeta) * 100;
|
||||
return precioCentavos;
|
||||
}
|
||||
export function stockFromMeta(dom: Window) {
|
||||
const stockMeta = getMetaProp(dom, "product:availability");
|
||||
return stockMeta === "instock";
|
||||
}
|
||||
|
||||
function parseJsonLds(dom: Window): object[] {
|
||||
const scripts = dom.window.document.querySelectorAll(
|
||||
|
|
|
@ -1,20 +1,7 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish, type Precio } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
import { Precioish } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
||||
|
||||
function getEanByTable(dom: Window): string {
|
||||
const eanLabelEl = dom.window.document.querySelector(
|
||||
'td[data-specification="EAN"]'
|
||||
);
|
||||
const eanValueEl = eanLabelEl?.parentElement?.children[1];
|
||||
if (
|
||||
!eanValueEl ||
|
||||
!(eanValueEl instanceof dom.window.HTMLElement) ||
|
||||
!eanValueEl.dataset.specification
|
||||
)
|
||||
throw new Error("No encontré el EAN");
|
||||
return eanValueEl.dataset.specification;
|
||||
}
|
||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||
`template[data-type="json"][data-varname="${varname}"]`
|
||||
|
@ -37,31 +24,27 @@ function eanFromSeedState(dom: Window): string {
|
|||
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
||||
return productSkuJson[1].ean;
|
||||
}
|
||||
function eanFromDynamicYieldScript(dom: Window): string {
|
||||
const scriptEl = dom.window.document.querySelector(
|
||||
`script[src^="//st.dynamicyield.com/st?"]`
|
||||
);
|
||||
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
|
||||
throw new Error("no encuentro el script de dynamicyield");
|
||||
|
||||
const url = new URL(scriptEl.src);
|
||||
const ctx = url.searchParams.get("ctx");
|
||||
if (!ctx) throw new Error("no hay ctx");
|
||||
return JSON.parse(ctx).data[0];
|
||||
}
|
||||
|
||||
export function getCarrefourProduct(html: string | Buffer): Precioish {
|
||||
const dom = parseHTML(html);
|
||||
|
||||
const precioCentavos = priceFromMeta(dom);
|
||||
const inStock = stockFromMeta(dom);
|
||||
|
||||
const ean = eanFromSeedState(dom);
|
||||
|
||||
let name, imageUrl;
|
||||
try {
|
||||
const ld = getProductJsonLd(dom);
|
||||
const name = ld.name;
|
||||
const imageUrl = ld.image;
|
||||
const inStock =
|
||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||
name = ld.name;
|
||||
imageUrl = ld.image;
|
||||
} catch (error) {
|
||||
if (inStock) {
|
||||
throw error;
|
||||
} else {
|
||||
// algunas paginas sin stock no tienen json ld
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
name,
|
||||
|
|
|
@ -37,9 +37,8 @@ export async function parseWarc(path: string) {
|
|||
errors: { error: any; warcRecordId: string; path: string }[];
|
||||
} = { done: 0, errors: [] };
|
||||
|
||||
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
||||
stderr: "ignore",
|
||||
}).stdout;
|
||||
const proc = Bun.spawn(["zstdcat", "-d", path], {});
|
||||
const warc = proc.stdout;
|
||||
// TODO: tirar error si falla zstd
|
||||
|
||||
const parser = new WARCParser(warc);
|
||||
|
@ -53,6 +52,12 @@ export async function parseWarc(path: string) {
|
|||
console.debug(`skipped ${warcRecordId}`);
|
||||
continue;
|
||||
}
|
||||
if (record.httpHeaders?.statusCode !== 200) {
|
||||
console.debug(
|
||||
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
|
||||
);
|
||||
continue;
|
||||
}
|
||||
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
||||
|
||||
const html = await record.contentText();
|
||||
|
@ -99,5 +104,9 @@ export async function parseWarc(path: string) {
|
|||
}
|
||||
}
|
||||
|
||||
if ((await proc.exited) !== 0) {
|
||||
throw new Error("zstd tiró un error");
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue