mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
Compare commits
No commits in common. "405502877c288e24e45c176b86e60b98d4ee67ca" and "eb0e62447b67f47e08324c1a041f9bfae244702f" have entirely different histories.
405502877c
...
eb0e62447b
4 changed files with 39 additions and 38 deletions
|
@ -7,19 +7,16 @@ import { parseWarc } from "./scrap.js";
|
||||||
if (process.argv[2] === "auto") {
|
if (process.argv[2] === "auto") {
|
||||||
await auto();
|
await auto();
|
||||||
} else if (process.argv[2] === "scrap-carrefour-links") {
|
} else if (process.argv[2] === "scrap-carrefour-links") {
|
||||||
await scrapCarrefourProducts();
|
await scrapCarrefourProducts()
|
||||||
} else if (process.argv[2] === "scrap-dia-links") {
|
} else if (process.argv[2] === "scrap-dia-links") {
|
||||||
await scrapDiaProducts();
|
await scrapDiaProducts()
|
||||||
} else if (process.argv[2] === "scrap-coto-links") {
|
} else if (process.argv[2] === "scrap-coto-links") {
|
||||||
await scrapCotoProducts();
|
await scrapCotoProducts()
|
||||||
} else if (process.argv[2] === "scrap") {
|
} else if (process.argv[2] === "scrap") {
|
||||||
const warcPaths = process.argv.slice(3);
|
const warcPaths = process.argv.slice(3);
|
||||||
if (warcPaths.length > 0) {
|
if (warcPaths.length > 0) {
|
||||||
for (const path of warcPaths) {
|
for (const path of warcPaths) {
|
||||||
const res = await parseWarc(path);
|
await parseWarc(path);
|
||||||
console.info("=======================================");
|
|
||||||
console.info(path, res);
|
|
||||||
console.info("=======================================");
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
console.error("Especificá WARCs para scrapear.");
|
console.error("Especificá WARCs para scrapear.");
|
||||||
|
|
|
@ -12,10 +12,6 @@ export function priceFromMeta(dom: Window) {
|
||||||
const precioCentavos = parseFloat(precioMeta) * 100;
|
const precioCentavos = parseFloat(precioMeta) * 100;
|
||||||
return precioCentavos;
|
return precioCentavos;
|
||||||
}
|
}
|
||||||
export function stockFromMeta(dom: Window) {
|
|
||||||
const stockMeta = getMetaProp(dom, "product:availability");
|
|
||||||
return stockMeta === "instock";
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseJsonLds(dom: Window): object[] {
|
function parseJsonLds(dom: Window): object[] {
|
||||||
const scripts = dom.window.document.querySelectorAll(
|
const scripts = dom.window.document.querySelectorAll(
|
||||||
|
|
|
@ -1,7 +1,20 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish } from "../scrap.js";
|
import { Precioish, type Precio } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
||||||
|
|
||||||
|
function getEanByTable(dom: Window): string {
|
||||||
|
const eanLabelEl = dom.window.document.querySelector(
|
||||||
|
'td[data-specification="EAN"]'
|
||||||
|
);
|
||||||
|
const eanValueEl = eanLabelEl?.parentElement?.children[1];
|
||||||
|
if (
|
||||||
|
!eanValueEl ||
|
||||||
|
!(eanValueEl instanceof dom.window.HTMLElement) ||
|
||||||
|
!eanValueEl.dataset.specification
|
||||||
|
)
|
||||||
|
throw new Error("No encontré el EAN");
|
||||||
|
return eanValueEl.dataset.specification;
|
||||||
|
}
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||||
`template[data-type="json"][data-varname="${varname}"]`
|
`template[data-type="json"][data-varname="${varname}"]`
|
||||||
|
@ -24,27 +37,31 @@ function eanFromSeedState(dom: Window): string {
|
||||||
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
||||||
return productSkuJson[1].ean;
|
return productSkuJson[1].ean;
|
||||||
}
|
}
|
||||||
|
function eanFromDynamicYieldScript(dom: Window): string {
|
||||||
|
const scriptEl = dom.window.document.querySelector(
|
||||||
|
`script[src^="//st.dynamicyield.com/st?"]`
|
||||||
|
);
|
||||||
|
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
|
||||||
|
throw new Error("no encuentro el script de dynamicyield");
|
||||||
|
|
||||||
|
const url = new URL(scriptEl.src);
|
||||||
|
const ctx = url.searchParams.get("ctx");
|
||||||
|
if (!ctx) throw new Error("no hay ctx");
|
||||||
|
return JSON.parse(ctx).data[0];
|
||||||
|
}
|
||||||
|
|
||||||
export function getCarrefourProduct(html: string | Buffer): Precioish {
|
export function getCarrefourProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
||||||
const precioCentavos = priceFromMeta(dom);
|
const precioCentavos = priceFromMeta(dom);
|
||||||
const inStock = stockFromMeta(dom);
|
|
||||||
|
|
||||||
const ean = eanFromSeedState(dom);
|
const ean = eanFromSeedState(dom);
|
||||||
|
|
||||||
let name, imageUrl;
|
|
||||||
try {
|
|
||||||
const ld = getProductJsonLd(dom);
|
const ld = getProductJsonLd(dom);
|
||||||
name = ld.name;
|
const name = ld.name;
|
||||||
imageUrl = ld.image;
|
const imageUrl = ld.image;
|
||||||
} catch (error) {
|
const inStock =
|
||||||
if (inStock) {
|
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||||
throw error;
|
|
||||||
} else {
|
|
||||||
// algunas paginas sin stock no tienen json ld
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
name,
|
name,
|
||||||
|
|
|
@ -37,8 +37,9 @@ export async function parseWarc(path: string) {
|
||||||
errors: { error: any; warcRecordId: string; path: string }[];
|
errors: { error: any; warcRecordId: string; path: string }[];
|
||||||
} = { done: 0, errors: [] };
|
} = { done: 0, errors: [] };
|
||||||
|
|
||||||
const proc = Bun.spawn(["zstdcat", "-d", path], {});
|
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
||||||
const warc = proc.stdout;
|
stderr: "ignore",
|
||||||
|
}).stdout;
|
||||||
// TODO: tirar error si falla zstd
|
// TODO: tirar error si falla zstd
|
||||||
|
|
||||||
const parser = new WARCParser(warc);
|
const parser = new WARCParser(warc);
|
||||||
|
@ -52,12 +53,6 @@ export async function parseWarc(path: string) {
|
||||||
console.debug(`skipped ${warcRecordId}`);
|
console.debug(`skipped ${warcRecordId}`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (record.httpHeaders?.statusCode !== 200) {
|
|
||||||
console.debug(
|
|
||||||
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
||||||
|
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
@ -104,9 +99,5 @@ export async function parseWarc(path: string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((await proc.exited) !== 0) {
|
|
||||||
throw new Error("zstd tiró un error");
|
|
||||||
}
|
|
||||||
|
|
||||||
return progress;
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue