scrapear dia y carrefour via bunjs

This commit is contained in:
Cat /dev/Nulo 2023-12-22 15:46:22 -03:00
parent 5e55ad7131
commit d85b54f837
8 changed files with 227 additions and 102 deletions

4
.gitignore vendored
View file

@ -3,4 +3,6 @@ data/carrefour
p.* p.*
p p
node_modules/ node_modules/
*.db *.db
scraper/debug/
scraper/x.tsv

View file

@ -43,7 +43,7 @@ importers:
version: 9.2.2 version: 9.2.2
drizzle-orm: drizzle-orm:
specifier: ^0.29.1 specifier: ^0.29.1
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2) version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19)
linkedom: linkedom:
specifier: ^0.16.5 specifier: ^0.16.5
version: 0.16.5 version: 0.16.5
@ -59,6 +59,9 @@ importers:
warcio: warcio:
specifier: ^2.2.1 specifier: ^2.2.1
version: 2.2.1 version: 2.2.1
zod:
specifier: ^3.22.4
version: 3.22.4
devDependencies: devDependencies:
'@types/better-sqlite3': '@types/better-sqlite3':
specifier: ^7.6.8 specifier: ^7.6.8
@ -66,6 +69,9 @@ importers:
'@types/node': '@types/node':
specifier: ^20.10.5 specifier: ^20.10.5
version: 20.10.5 version: 20.10.5
bun-types:
specifier: ^1.0.19
version: 1.0.19
tsx: tsx:
specifier: ^4.7.0 specifier: ^4.7.0
version: 4.7.0 version: 4.7.0
@ -274,6 +280,11 @@ packages:
dependencies: dependencies:
undici-types: 5.26.5 undici-types: 5.26.5
/@types/ws@8.5.10:
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
dependencies:
'@types/node': 20.10.5
/ansi-regex@5.0.1: /ansi-regex@5.0.1:
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
engines: {node: '>=8'} engines: {node: '>=8'}
@ -330,6 +341,13 @@ packages:
ieee754: 1.2.1 ieee754: 1.2.1
dev: false dev: false
/bun-types@1.0.19:
resolution: {integrity: sha512-7P5/r+twssrkDQ6HMit2GARMBbAxz1tLLEcMgQOCZeCX9BzNtabktjPCu+DmcvDYDnL/Ke75pmKg9CNBTlCzlQ==}
dependencies:
'@types/node': 20.10.5
'@types/ws': 8.5.10
undici-types: 5.26.5
/chownr@1.1.4: /chownr@1.1.4:
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
dev: false dev: false
@ -424,7 +442,7 @@ packages:
domhandler: 5.0.3 domhandler: 5.0.3
dev: false dev: false
/drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2): /drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19):
resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==} resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==}
peerDependencies: peerDependencies:
'@aws-sdk/client-rds-data': '>=3' '@aws-sdk/client-rds-data': '>=3'
@ -488,6 +506,7 @@ packages:
dependencies: dependencies:
'@types/better-sqlite3': 7.6.8 '@types/better-sqlite3': 7.6.8
better-sqlite3: 9.2.2 better-sqlite3: 9.2.2
bun-types: 1.0.19
dev: false dev: false
/emoji-regex@8.0.0: /emoji-regex@8.0.0:
@ -955,3 +974,7 @@ packages:
y18n: 5.0.8 y18n: 5.0.8
yargs-parser: 21.1.1 yargs-parser: 21.1.1
dev: false dev: false
/zod@3.22.4:
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
dev: false

70
scraper/carrefour.ts Normal file
View file

@ -0,0 +1,70 @@
import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js";
/**
 * Reads the product EAN from the specifications table of a product page.
 *
 * Finds the `td` whose `data-specification` attribute is "EAN", takes the
 * second child of that cell's parent element, and returns that element's own
 * `data-specification` value.
 *
 * @param dom - Parsed page exposing `window.document` and `window.HTMLElement`.
 * @returns The non-empty `data-specification` value of the value cell.
 * @throws Error if the value cell is missing, is not an `HTMLElement` of this
 *   window, or has no (or an empty) `data-specification` value.
 */
function getEanByTable(dom: Window): string {
  const win = dom.window;
  const labelCell = win.document.querySelector('td[data-specification="EAN"]');
  const valueCell = labelCell?.parentElement?.children[1];
  const ean =
    valueCell && valueCell instanceof win.HTMLElement
      ? valueCell.dataset.specification
      : undefined;
  if (!ean) throw new Error("No encontré el EAN");
  return ean;
}
/**
 * Parses the JSON embedded in a `<template data-type="json">` element.
 *
 * Selects the template whose `data-varname` equals `varname` and parses the
 * `innerHTML` of the first child of its content.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @param varname - Value of the template's `data-varname` attribute.
 * @returns The parsed JSON, typed as `T` without any runtime validation.
 * @throws Error if the template or its first content child is missing;
 *   `SyntaxError` (from `JSON.parse`) if the payload is not valid JSON.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const payloadEl = template?.content?.children[0];
  if (!payloadEl) throw new Error("no encuentro el script");
  return JSON.parse(payloadEl.innerHTML);
}
/**
 * Extracts the EAN from the `__STATE__` JSON embedded in the page.
 *
 * First finds the entry whose key starts with "Product:" and whose
 * `__typename` is "Product". Then finds the entry whose key starts with
 * `Product:<cacheId of that product>` and whose `__typename` is "SKU", and
 * returns its `ean` field.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns The `ean` field of the matching SKU entry (not validated).
 * @throws Error if the product entry or the SKU entry cannot be found, or if
 *   the `__STATE__` template itself is missing.
 */
function eanFromSeedState(dom: Window): string {
  const entries = Object.entries(parseScriptJson<object>(dom, "__STATE__"));
  // First entry with the given key prefix and `__typename`, if any.
  const findEntry = (keyPrefix: string, typename: string) =>
    entries.find(
      ([key, val]) => key.startsWith(keyPrefix) && val.__typename === typename
    );

  const productEntry = findEntry("Product:", "Product");
  if (!productEntry) throw new Error("no encontré el product en el json");

  const [, product] = productEntry;
  const skuEntry = findEntry(`Product:${product.cacheId}`, "SKU");
  if (!skuEntry) throw new Error("no encontré el sku en el json");

  const [, sku] = skuEntry;
  return sku.ean;
}
/**
 * Extracts the EAN from the `ctx` query parameter of the Dynamic Yield script.
 *
 * Finds the `<script>` whose `src` starts with "//st.dynamicyield.com/st?",
 * parses the `ctx` query parameter as JSON and returns the first item of its
 * `data` array.
 *
 * @param dom - Parsed page exposing `window.document` and
 *   `window.HTMLScriptElement`.
 * @returns The first element of `data` in the `ctx` payload (not validated).
 * @throws Error if the script element is missing or is not an
 *   `HTMLScriptElement` of this window, or if the URL has no `ctx` parameter;
 *   `SyntaxError` if `ctx` is not valid JSON.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const scriptEl = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");
  // The selector matches a protocol-relative `src` ("//host/..."). `new URL`
  // rejects that form unless a base is given, so supply one. The base is
  // ignored when `src` is already an absolute URL.
  const url = new URL(scriptEl.src, "https://st.dynamicyield.com");
  const ctx = url.searchParams.get("ctx");
  if (!ctx) throw new Error("no hay ctx");
  return JSON.parse(ctx).data[0];
}
/**
 * Builds the price data point for a Carrefour product page.
 *
 * The price comes from the page's price meta tag, the EAN from the embedded
 * `__STATE__` JSON, and stock availability from the Product JSON-LD.
 *
 * @param html - Raw HTML of the product page.
 * @returns The EAN, the price in centavos (as returned by `priceFromMeta`)
 *   and whether the single offer is marked in stock.
 * @throws Error if the EAN cannot be located or the Product JSON-LD does not
 *   match the expected schema.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const ean = eanFromSeedState(dom);
  const [offer] = getProductJsonLd(dom).offers.offers;
  return {
    ean,
    precioCentavos,
    inStock: offer.availability === "http://schema.org/InStock",
  };
}

50
scraper/common.ts Normal file
View file

@ -0,0 +1,50 @@
import { z } from "zod";
/**
 * Returns the `content` attribute of the first `<meta>` element whose
 * `property` attribute equals `prop`.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @param prop - Value of the `property` attribute to look for.
 * @returns The attribute value, `undefined` when no such element exists, or
 *   whatever `getAttribute` yields when the element lacks `content`.
 */
export function getMetaProp(dom: Window, prop: string) {
  const metaEl = dom.window.document.querySelector(`meta[property="${prop}"]`);
  return metaEl?.getAttribute("content");
}
/**
 * Reads the product price from the `product:price:amount` meta tag and
 * converts it to an integer number of centavos.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns The price in centavos, or `null` when the meta tag is missing,
 *   empty, or does not contain a parsable number.
 */
export function priceFromMeta(dom: Window): number | null {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  const precio = parseFloat(precioMeta);
  // An unparsable amount is treated like a missing one instead of leaking NaN.
  if (!Number.isFinite(precio)) return null;
  // Binary floating point makes e.g. 1.15 * 100 === 114.99999999999999, so
  // round to get a whole number of centavos.
  return Math.round(precio * 100);
}
/**
 * Parses every `<script type="application/ld+json">` element in the page.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns One parsed value per matching script, in document order.
 * @throws SyntaxError (from `JSON.parse`) if any script body is not valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const ldScripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );
  const parsed: object[] = [];
  for (const scriptEl of ldScripts) {
    parsed.push(JSON.parse(scriptEl.innerHTML));
  }
  return parsed;
}
/**
 * Finds the first JSON-LD object in the page whose `@type` equals `type`.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @param type - The `@type` value to match exactly.
 * @returns The first matching parsed object, or `undefined` if none matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const candidate of parseJsonLds(dom)) {
    if ("@type" in candidate && candidate["@type"] === type) return candidate;
  }
  return undefined;
}
/** The two schema.org availability URLs accepted for an offer. */
const zOfferAvailability = z.enum([
  "http://schema.org/OutOfStock",
  "http://schema.org/InStock",
]);

/** A single `Offer` entry: numeric price, priced in ARS, with availability. */
const zOfferLd = z.object({
  "@type": z.literal("Offer"),
  price: z.number(),
  priceCurrency: z.literal("ARS"),
  availability: zOfferAvailability,
});

/**
 * Expected shape of a `Product` JSON-LD object: a name, an image and an
 * `offers` wrapper whose `offers` list holds exactly one `Offer`.
 */
const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  offers: z.object({
    offers: z.tuple([zOfferLd]),
  }),
});

/** Static type of a validated `Product` JSON-LD object. */
type ProductLd = z.infer<typeof zProductLd>;
/**
 * Finds the `Product` JSON-LD object in the page and validates it.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns The JSON-LD object parsed through `zProductLd`.
 * @throws If no `Product` JSON-LD exists or it does not match `zProductLd`
 *   (the error comes from the schema's `parse`).
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}

21
scraper/dia.ts Normal file
View file

@ -0,0 +1,21 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
/**
 * Builds the price data point for a Dia product page.
 *
 * The EAN is read from the `product:retailer_item_id` meta tag, the price
 * from the price meta tag, and stock availability from the Product JSON-LD.
 *
 * @param html - Raw HTML of the product page.
 * @returns The EAN, the price in centavos (as returned by `priceFromMeta`)
 *   and whether the single offer is marked in stock.
 * @throws Error if the EAN meta tag is missing or empty, or if the Product
 *   JSON-LD does not match the expected schema.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) throw new Error("No encontré el ean");

  const precioCentavos = priceFromMeta(dom);
  const [offer] = getProductJsonLd(dom).offers.offers;

  return {
    ean,
    precioCentavos,
    inStock: offer.availability === "http://schema.org/InStock",
  };
}

View file

@ -17,11 +17,13 @@
"nanoid": "^5.0.4", "nanoid": "^5.0.4",
"p-map": "^7.0.0", "p-map": "^7.0.0",
"undici": "^6.2.0", "undici": "^6.2.0",
"warcio": "^2.2.1" "warcio": "^2.2.1",
"zod": "^3.22.4"
}, },
"devDependencies": { "devDependencies": {
"@types/better-sqlite3": "^7.6.8", "@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.5", "@types/node": "^20.10.5",
"bun-types": "^1.0.19",
"tsx": "^4.7.0", "tsx": "^4.7.0",
"typescript": "^5.3.3" "typescript": "^5.3.3"
} }

View file

@ -1,128 +1,79 @@
/// <reference lib="dom" /> /// <reference lib="dom" />
/// <reference lib="dom.iterable" /> /// <reference lib="dom.iterable" />
/// <reference types="node" /> /// <reference types="node" />
import { parseHTML } from "linkedom"; import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { drizzle } from "drizzle-orm/bun-sqlite";
import Database from "better-sqlite3";
import { precios } from "./db/schema.js"; import { precios } from "./db/schema.js";
import { WARCParser } from "warcio"; import { WARCParser } from "warcio";
import { createReadStream } from "fs"; import { createReadStream, createWriteStream } from "fs";
import { writeFile } from "fs/promises"; import { writeFile } from "fs/promises";
import { createHash } from "crypto"; import { createHash } from "crypto";
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./dia.js";
import { join } from "path";
const sqlite = new Database("sqlite.db"); const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite); const db = drizzle(sqlite);
type Precio = typeof precios.$inferInsert; const DEBUG = true;
export type Precio = typeof precios.$inferInsert;
export type Precioish = Omit<Precio, "fetchedAt" | "url">;
async function storePrecioPoint(point: Precio) { async function storePrecioPoint(point: Precio) {
await db.insert(precios).values(point); await db.insert(precios).values(point);
} }
/**
 * Reads the product EAN from the specifications table of a product page.
 *
 * Finds the `td` whose `data-specification` attribute is "EAN", takes the
 * second child of that cell's parent element, and returns that element's own
 * `data-specification` value.
 *
 * @param dom - Parsed page exposing `window.document` and `window.HTMLElement`.
 * @returns The non-empty `data-specification` value of the value cell.
 * @throws Error if the value cell is missing, is not an `HTMLElement` of this
 *   window, or has no (or an empty) `data-specification` value.
 */
function getEanByTable(dom: Window): string {
  const win = dom.window;
  const labelCell = win.document.querySelector('td[data-specification="EAN"]');
  const valueCell = labelCell?.parentElement?.children[1];
  const ean =
    valueCell && valueCell instanceof win.HTMLElement
      ? valueCell.dataset.specification
      : undefined;
  if (!ean) throw new Error("No encontré el EAN");
  return ean;
}
/**
 * Parses every `<script type="application/ld+json">` element in the page.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns One parsed value per matching script, in document order.
 * @throws SyntaxError (from `JSON.parse`) if any script body is not valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const ldScripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );
  const parsed: object[] = [];
  for (const scriptEl of ldScripts) {
    parsed.push(JSON.parse(scriptEl.innerHTML));
  }
  return parsed;
}
/**
 * Finds the first JSON-LD object in the page whose `@type` equals `type`.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @param type - The `@type` value to match exactly.
 * @returns The first matching parsed object, or `undefined` if none matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const candidate of parseJsonLds(dom)) {
    if ("@type" in candidate && candidate["@type"] === type) return candidate;
  }
  return undefined;
}
/**
 * Parses the JSON embedded in a `<template data-type="json">` element.
 *
 * Selects the template whose `data-varname` equals `varname` and parses the
 * `innerHTML` of the first child of its content.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @param varname - Value of the template's `data-varname` attribute.
 * @returns The parsed JSON, typed as `T` without any runtime validation.
 * @throws Error if the template or its first content child is missing;
 *   `SyntaxError` (from `JSON.parse`) if the payload is not valid JSON.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const payloadEl = template?.content?.children[0];
  if (!payloadEl) throw new Error("no encuentro el script");
  return JSON.parse(payloadEl.innerHTML);
}
/**
 * Extracts the EAN from the `__STATE__` JSON embedded in the page.
 *
 * First finds the entry whose key starts with "Product:" and whose
 * `__typename` is "Product". Then finds the entry whose key starts with
 * `Product:<cacheId of that product>` and whose `__typename` is "SKU", and
 * returns its `ean` field.
 *
 * @param dom - Parsed page exposing `window.document`.
 * @returns The `ean` field of the matching SKU entry (not validated).
 * @throws Error if the product entry or the SKU entry cannot be found, or if
 *   the `__STATE__` template itself is missing.
 */
function eanFromSeedState(dom: Window): string {
  const entries = Object.entries(parseScriptJson<object>(dom, "__STATE__"));
  // First entry with the given key prefix and `__typename`, if any.
  const findEntry = (keyPrefix: string, typename: string) =>
    entries.find(
      ([key, val]) => key.startsWith(keyPrefix) && val.__typename === typename
    );

  const productEntry = findEntry("Product:", "Product");
  if (!productEntry) throw new Error("no encontré el product en el json");

  const [, product] = productEntry;
  const skuEntry = findEntry(`Product:${product.cacheId}`, "SKU");
  if (!skuEntry) throw new Error("no encontré el sku en el json");

  const [, sku] = skuEntry;
  return sku.ean;
}
/**
 * Extracts the EAN from the `ctx` query parameter of the Dynamic Yield script.
 *
 * Finds the `<script>` whose `src` starts with "//st.dynamicyield.com/st?",
 * parses the `ctx` query parameter as JSON and returns the first item of its
 * `data` array.
 *
 * @param dom - Parsed page exposing `window.document` and
 *   `window.HTMLScriptElement`.
 * @returns The first element of `data` in the `ctx` payload (not validated).
 * @throws Error if the script element is missing or is not an
 *   `HTMLScriptElement` of this window, or if the URL has no `ctx` parameter;
 *   `SyntaxError` if `ctx` is not valid JSON.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const scriptEl = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");
  // The selector matches a protocol-relative `src` ("//host/..."). `new URL`
  // rejects that form unless a base is given, so supply one. The base is
  // ignored when `src` is already an absolute URL.
  const url = new URL(scriptEl.src, "https://st.dynamicyield.com");
  const ctx = url.searchParams.get("ctx");
  if (!ctx) throw new Error("no hay ctx");
  return JSON.parse(ctx).data[0];
}
/**
 * Builds a price data point for a Carrefour product page.
 *
 * The price is read from the `product:price:amount` meta tag and converted to
 * an integer number of centavos; the EAN comes from the embedded `__STATE__`
 * JSON. `fetchedAt` is set to the time of the call.
 *
 * @param html - Raw HTML of the product page.
 * @returns The EAN, the price in centavos and the current timestamp.
 * @throws Error if the price meta tag is missing, empty or not a number, or
 *   if the EAN cannot be located.
 */
function getCarrefourProduct(html: string | Buffer): Precio {
  const dom = parseHTML(html);
  const precioMeta = dom.window.document
    .querySelector(`meta[property="product:price:amount"]`)
    ?.getAttribute("content");
  if (!precioMeta) throw new Error("No encontré el precio");
  const precio = parseFloat(precioMeta);
  // An unparsable amount is treated like a missing one instead of storing NaN.
  if (!Number.isFinite(precio)) throw new Error("No encontré el precio");
  // Binary floating point makes e.g. 1.15 * 100 === 114.99999999999999, so
  // round to get a whole number of centavos.
  const precioCentavos = Math.round(precio * 100);

  const ean = eanFromSeedState(dom);

  return {
    ean,
    precioCentavos,
    fetchedAt: new Date(),
  };
}
(async () => { (async () => {
// await migrate(db, { migrationsFolder: "./drizzle" }); const o = createWriteStream("x.tsv");
// const p = await getCarrefourProduct( o.write(`ean\tfetchedAt\tprecioCentavos\tinStock\turl\n`);
// "https://www.carrefour.com.ar/bebida-lactea-la-serenisima-ultra-0-grasa-vainilla-900-cc/p"
// );
// await storePrecioPoint(p);
const warc = createReadStream(process.argv[2]); const warc = createReadStream(process.argv[2]);
const parser = new WARCParser(warc); const parser = new WARCParser(warc);
let progress = { done: 0, errors: 0 };
for await (const record of parser) { for await (const record of parser) {
if (record.warcType === "response") { if (record.warcType === "response") {
if (!record.warcTargetURI) throw new Error("no uri");
console.log(record.warcTargetURI); console.log(record.warcTargetURI);
const html = await record.contentText(); const html = await record.contentText();
const url = new URL(record.warcTargetURI);
try { try {
const product = getCarrefourProduct(html); let ish: Precioish | undefined = undefined;
console.log(product); if (url.hostname === "www.carrefour.com.ar")
ish = getCarrefourProduct(html);
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
ish = getDiaProduct(html);
else console.error(`Unknown host ${url.hostname}`);
const p: Precio = {
...ish,
fetchedAt: new Date(record.warcDate!),
url: record.warcTargetURI,
};
if (ish)
o.write(
`${p.ean}\t${p.fetchedAt}\t${p.precioCentavos}\t${p.inStock}\t${p.url}\n`
);
// console.log(product);
progress.done++;
} catch (error) { } catch (error) {
console.error(error); console.error(error);
const urlHash = createHash("md5") progress.errors++;
.update(record.warcTargetURI!)
.digest("hex"); if (DEBUG) {
const output = `${urlHash}.html`; const urlHash = createHash("md5")
await writeFile(output, html); .update(record.warcTargetURI!)
console.error(`wrote html to ${output}`); .digest("hex");
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);
console.error(`wrote html to ${output}`);
}
} finally {
console.debug(progress);
} }
} }
} }

6
scraper/tsconfig.json Normal file
View file

@ -0,0 +1,6 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"types": ["bun-types"]
}
}