mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
Compare commits
6 commits
eb0e62447b
...
405502877c
Author | SHA1 | Date | |
---|---|---|---|
405502877c | |||
7b989f0ea0 | |||
f85c74a7bc | |||
518a7cb2f5 | |||
eb271fb496 | |||
8482d7ceea |
4 changed files with 38 additions and 39 deletions
|
@ -7,16 +7,19 @@ import { parseWarc } from "./scrap.js";
|
||||||
if (process.argv[2] === "auto") {
|
if (process.argv[2] === "auto") {
|
||||||
await auto();
|
await auto();
|
||||||
} else if (process.argv[2] === "scrap-carrefour-links") {
|
} else if (process.argv[2] === "scrap-carrefour-links") {
|
||||||
await scrapCarrefourProducts()
|
await scrapCarrefourProducts();
|
||||||
} else if (process.argv[2] === "scrap-dia-links") {
|
} else if (process.argv[2] === "scrap-dia-links") {
|
||||||
await scrapDiaProducts()
|
await scrapDiaProducts();
|
||||||
} else if (process.argv[2] === "scrap-coto-links") {
|
} else if (process.argv[2] === "scrap-coto-links") {
|
||||||
await scrapCotoProducts()
|
await scrapCotoProducts();
|
||||||
} else if (process.argv[2] === "scrap") {
|
} else if (process.argv[2] === "scrap") {
|
||||||
const warcPaths = process.argv.slice(3);
|
const warcPaths = process.argv.slice(3);
|
||||||
if (warcPaths.length > 0) {
|
if (warcPaths.length > 0) {
|
||||||
for (const path of warcPaths) {
|
for (const path of warcPaths) {
|
||||||
await parseWarc(path);
|
const res = await parseWarc(path);
|
||||||
|
console.info("=======================================");
|
||||||
|
console.info(path, res);
|
||||||
|
console.info("=======================================");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
console.error("Especificá WARCs para scrapear.");
|
console.error("Especificá WARCs para scrapear.");
|
||||||
|
|
|
@ -12,6 +12,10 @@ export function priceFromMeta(dom: Window) {
|
||||||
const precioCentavos = parseFloat(precioMeta) * 100;
|
const precioCentavos = parseFloat(precioMeta) * 100;
|
||||||
return precioCentavos;
|
return precioCentavos;
|
||||||
}
|
}
|
||||||
|
export function stockFromMeta(dom: Window) {
|
||||||
|
const stockMeta = getMetaProp(dom, "product:availability");
|
||||||
|
return stockMeta === "instock";
|
||||||
|
}
|
||||||
|
|
||||||
function parseJsonLds(dom: Window): object[] {
|
function parseJsonLds(dom: Window): object[] {
|
||||||
const scripts = dom.window.document.querySelectorAll(
|
const scripts = dom.window.document.querySelectorAll(
|
||||||
|
|
|
@ -1,20 +1,7 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish, type Precio } from "../scrap.js";
|
import { Precioish } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
||||||
|
|
||||||
function getEanByTable(dom: Window): string {
|
|
||||||
const eanLabelEl = dom.window.document.querySelector(
|
|
||||||
'td[data-specification="EAN"]'
|
|
||||||
);
|
|
||||||
const eanValueEl = eanLabelEl?.parentElement?.children[1];
|
|
||||||
if (
|
|
||||||
!eanValueEl ||
|
|
||||||
!(eanValueEl instanceof dom.window.HTMLElement) ||
|
|
||||||
!eanValueEl.dataset.specification
|
|
||||||
)
|
|
||||||
throw new Error("No encontré el EAN");
|
|
||||||
return eanValueEl.dataset.specification;
|
|
||||||
}
|
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||||
`template[data-type="json"][data-varname="${varname}"]`
|
`template[data-type="json"][data-varname="${varname}"]`
|
||||||
|
@ -37,31 +24,27 @@ function eanFromSeedState(dom: Window): string {
|
||||||
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
||||||
return productSkuJson[1].ean;
|
return productSkuJson[1].ean;
|
||||||
}
|
}
|
||||||
function eanFromDynamicYieldScript(dom: Window): string {
|
|
||||||
const scriptEl = dom.window.document.querySelector(
|
|
||||||
`script[src^="//st.dynamicyield.com/st?"]`
|
|
||||||
);
|
|
||||||
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
|
|
||||||
throw new Error("no encuentro el script de dynamicyield");
|
|
||||||
|
|
||||||
const url = new URL(scriptEl.src);
|
|
||||||
const ctx = url.searchParams.get("ctx");
|
|
||||||
if (!ctx) throw new Error("no hay ctx");
|
|
||||||
return JSON.parse(ctx).data[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
export function getCarrefourProduct(html: string | Buffer): Precioish {
|
export function getCarrefourProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
||||||
const precioCentavos = priceFromMeta(dom);
|
const precioCentavos = priceFromMeta(dom);
|
||||||
|
const inStock = stockFromMeta(dom);
|
||||||
|
|
||||||
const ean = eanFromSeedState(dom);
|
const ean = eanFromSeedState(dom);
|
||||||
|
|
||||||
const ld = getProductJsonLd(dom);
|
let name, imageUrl;
|
||||||
const name = ld.name;
|
try {
|
||||||
const imageUrl = ld.image;
|
const ld = getProductJsonLd(dom);
|
||||||
const inStock =
|
name = ld.name;
|
||||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
imageUrl = ld.image;
|
||||||
|
} catch (error) {
|
||||||
|
if (inStock) {
|
||||||
|
throw error;
|
||||||
|
} else {
|
||||||
|
// algunas paginas sin stock no tienen json ld
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
name,
|
name,
|
||||||
|
|
|
@ -37,9 +37,8 @@ export async function parseWarc(path: string) {
|
||||||
errors: { error: any; warcRecordId: string; path: string }[];
|
errors: { error: any; warcRecordId: string; path: string }[];
|
||||||
} = { done: 0, errors: [] };
|
} = { done: 0, errors: [] };
|
||||||
|
|
||||||
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
const proc = Bun.spawn(["zstdcat", "-d", path], {});
|
||||||
stderr: "ignore",
|
const warc = proc.stdout;
|
||||||
}).stdout;
|
|
||||||
// TODO: tirar error si falla zstd
|
// TODO: tirar error si falla zstd
|
||||||
|
|
||||||
const parser = new WARCParser(warc);
|
const parser = new WARCParser(warc);
|
||||||
|
@ -53,6 +52,12 @@ export async function parseWarc(path: string) {
|
||||||
console.debug(`skipped ${warcRecordId}`);
|
console.debug(`skipped ${warcRecordId}`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (record.httpHeaders?.statusCode !== 200) {
|
||||||
|
console.debug(
|
||||||
|
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
||||||
|
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
@ -99,5 +104,9 @@ export async function parseWarc(path: string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((await proc.exited) !== 0) {
|
||||||
|
throw new Error("zstd tiró un error");
|
||||||
|
}
|
||||||
|
|
||||||
return progress;
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue