From d85b54f8372bb6cc54545d548ef08429e3e13a29 Mon Sep 17 00:00:00 2001 From: Nulo Date: Fri, 22 Dec 2023 15:46:22 -0300 Subject: [PATCH] scrapear dia y carrefour via bunjs --- .gitignore | 4 +- pnpm-lock.yaml | 27 +++++++- scraper/carrefour.ts | 70 ++++++++++++++++++++ scraper/common.ts | 50 ++++++++++++++ scraper/dia.ts | 21 ++++++ scraper/package.json | 4 +- scraper/scrap.ts | 147 ++++++++++++++---------------------------- scraper/tsconfig.json | 6 ++ 8 files changed, 227 insertions(+), 102 deletions(-) create mode 100644 scraper/carrefour.ts create mode 100644 scraper/common.ts create mode 100644 scraper/dia.ts create mode 100644 scraper/tsconfig.json diff --git a/.gitignore b/.gitignore index bb41a98..c73b41a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ data/carrefour p.* p node_modules/ -*.db \ No newline at end of file +*.db +scraper/debug/ +scraper/x.tsv \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a9e973b..798ad39 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -43,7 +43,7 @@ importers: version: 9.2.2 drizzle-orm: specifier: ^0.29.1 - version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2) + version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19) linkedom: specifier: ^0.16.5 version: 0.16.5 @@ -59,6 +59,9 @@ importers: warcio: specifier: ^2.2.1 version: 2.2.1 + zod: + specifier: ^3.22.4 + version: 3.22.4 devDependencies: '@types/better-sqlite3': specifier: ^7.6.8 @@ -66,6 +69,9 @@ importers: '@types/node': specifier: ^20.10.5 version: 20.10.5 + bun-types: + specifier: ^1.0.19 + version: 1.0.19 tsx: specifier: ^4.7.0 version: 4.7.0 @@ -274,6 +280,11 @@ packages: dependencies: undici-types: 5.26.5 + /@types/ws@8.5.10: + resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} + dependencies: + '@types/node': 20.10.5 + /ansi-regex@5.0.1: resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} engines: {node: '>=8'} @@ -330,6 +341,13 @@ packages: ieee754: 1.2.1 dev: false + /bun-types@1.0.19: + resolution: {integrity: sha512-7P5/r+twssrkDQ6HMit2GARMBbAxz1tLLEcMgQOCZeCX9BzNtabktjPCu+DmcvDYDnL/Ke75pmKg9CNBTlCzlQ==} + dependencies: + '@types/node': 20.10.5 + '@types/ws': 8.5.10 + undici-types: 5.26.5 + /chownr@1.1.4: resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} dev: false @@ -424,7 +442,7 @@ packages: domhandler: 5.0.3 dev: false - /drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2): + /drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19): resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==} peerDependencies: '@aws-sdk/client-rds-data': '>=3' @@ -488,6 +506,7 @@ packages: dependencies: '@types/better-sqlite3': 7.6.8 better-sqlite3: 9.2.2 + bun-types: 1.0.19 dev: false /emoji-regex@8.0.0: @@ -955,3 +974,7 @@ packages: y18n: 5.0.8 yargs-parser: 21.1.1 dev: false + + /zod@3.22.4: + resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==} + dev: false diff --git a/scraper/carrefour.ts b/scraper/carrefour.ts new file mode 100644 index 0000000..da79ce6 --- /dev/null +++ b/scraper/carrefour.ts @@ -0,0 +1,70 @@ +import { parseHTML } from "linkedom"; +import { Precioish, type Precio } from "./scrap.js"; +import { getProductJsonLd, priceFromMeta } from "./common.js"; + +function getEanByTable(dom: Window): string { + const eanLabelEl = dom.window.document.querySelector( + 'td[data-specification="EAN"]' + ); + const eanValueEl = eanLabelEl?.parentElement?.children[1]; + if ( + !eanValueEl || + !(eanValueEl instanceof dom.window.HTMLElement) || + !eanValueEl.dataset.specification + ) + throw new Error("No encontré el EAN"); + return eanValueEl.dataset.specification; +} +function parseScriptJson(dom: Window, varname: string): T { + const script = dom.window.document.querySelector( + `template[data-type="json"][data-varname="${varname}"]` + )?.content?.children[0]; + if (!script) throw new Error("no encuentro el script"); + return JSON.parse(script.innerHTML); +} +function eanFromSeedState(dom: Window): string { + const json = parseScriptJson(dom, "__STATE__"); + const productJson = Object.entries(json).find( + ([key, val]) => key.startsWith("Product:") && val.__typename === "Product" + ); + if (!productJson) throw new Error("no encontré el product en el json"); + + const productSkuJson = Object.entries(json).find( + ([key, val]) => + key.startsWith(`Product:${productJson[1].cacheId}`) && + val.__typename === "SKU" + ); + if (!productSkuJson) throw new Error("no encontré el sku en el json"); + return productSkuJson[1].ean; +} +function eanFromDynamicYieldScript(dom: Window): string { + const scriptEl = dom.window.document.querySelector( + `script[src^="//st.dynamicyield.com/st?"]` + ); + if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement)) + throw new Error("no encuentro el script de dynamicyield"); + + const url = new URL(scriptEl.src); + const ctx = url.searchParams.get("ctx"); + if (!ctx) throw new Error("no hay ctx"); + return JSON.parse(ctx).data[0]; +} + +export function getCarrefourProduct(html: string | Buffer): Precioish { + const dom = parseHTML(html); + + const precioCentavos = priceFromMeta(dom); + + // const productLd = findJsonLd(dom, "Product"); + const ean = eanFromSeedState(dom); + + const ld = getProductJsonLd(dom); + const inStock = + ld.offers.offers[0].availability === "http://schema.org/InStock"; + + return { + ean, + precioCentavos, + inStock, + }; +} diff --git a/scraper/common.ts b/scraper/common.ts new file mode 100644 index 0000000..fbeb380 --- /dev/null +++ b/scraper/common.ts @@ -0,0 +1,50 @@ +import { z } from "zod"; + +export function getMetaProp(dom: Window, prop: string) { + return dom.window.document + .querySelector(`meta[property="${prop}"]`) + ?.getAttribute("content"); +} + +export function priceFromMeta(dom: Window) { + const precioMeta = getMetaProp(dom, "product:price:amount"); + if (!precioMeta) return null; + const precioCentavos = parseFloat(precioMeta) * 100; + return precioCentavos; +} + +function parseJsonLds(dom: Window): object[] { + const scripts = dom.window.document.querySelectorAll( + 'script[type="application/ld+json"]' + ); + return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML)); +} +function findJsonLd(dom: Window, type: string): object | undefined { + return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type); +} + +const zProductLd = z.object({ + "@type": z.literal("Product"), + name: z.string(), + image: z.string(), + offers: z.object({ + offers: z.tuple([ + z.object({ + "@type": z.literal("Offer"), + price: z.number(), + priceCurrency: z.literal("ARS"), + availability: z.enum([ + "http://schema.org/OutOfStock", + "http://schema.org/InStock", + ]), + }), + ]), + }), +}); +type ProductLd = z.infer; + +export function getProductJsonLd(dom: Window): ProductLd { + const ld = findJsonLd(dom, "Product"); + const productLd = zProductLd.parse(ld); + return productLd; +} diff --git a/scraper/dia.ts b/scraper/dia.ts new file mode 100644 index 0000000..7359f1c --- /dev/null +++ b/scraper/dia.ts @@ -0,0 +1,21 @@ +import { parseHTML } from "linkedom"; +import { type Precioish } from "./scrap.js"; +import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js"; + +export function getDiaProduct(html: string | Buffer): Precioish { + const dom = parseHTML(html); + + const ean = getMetaProp(dom, "product:retailer_item_id"); + if (!ean) throw new Error("No encontré el ean"); + const precioCentavos = priceFromMeta(dom); + + const ld = getProductJsonLd(dom); + const inStock = + ld.offers.offers[0].availability === "http://schema.org/InStock"; + + return { + ean, + precioCentavos, + inStock, + }; +} diff --git a/scraper/package.json b/scraper/package.json index 815b4bf..865f64a 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -17,11 +17,13 @@ "nanoid": "^5.0.4", "p-map": "^7.0.0", "undici": "^6.2.0", - "warcio": "^2.2.1" + "warcio": "^2.2.1", + "zod": "^3.22.4" }, "devDependencies": { "@types/better-sqlite3": "^7.6.8", "@types/node": "^20.10.5", + "bun-types": "^1.0.19", "tsx": "^4.7.0", "typescript": "^5.3.3" } diff --git a/scraper/scrap.ts b/scraper/scrap.ts index 954dd2a..8b667d4 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -1,128 +1,79 @@ /// /// /// -import { parseHTML } from "linkedom"; -import { drizzle } from "drizzle-orm/better-sqlite3"; -import Database from "better-sqlite3"; +import { Database } from "bun:sqlite"; +import { drizzle } from "drizzle-orm/bun-sqlite"; import { precios } from "./db/schema.js"; import { WARCParser } from "warcio"; -import { createReadStream } from "fs"; +import { createReadStream, createWriteStream } from "fs"; import { writeFile } from "fs/promises"; import { createHash } from "crypto"; +import { migrate } from "drizzle-orm/bun-sqlite/migrator"; +import { getCarrefourProduct } from "./carrefour.js"; +import { getDiaProduct } from "./dia.js"; +import { join } from "path"; const sqlite = new Database("sqlite.db"); const db = drizzle(sqlite); -type Precio = typeof precios.$inferInsert; +const DEBUG = true; + +export type Precio = typeof precios.$inferInsert; +export type Precioish = Omit; async function storePrecioPoint(point: Precio) { await db.insert(precios).values(point); } -function getEanByTable(dom: Window): string { - const eanLabelEl = dom.window.document.querySelector( - 'td[data-specification="EAN"]' - ); - const eanValueEl = eanLabelEl?.parentElement?.children[1]; - if ( - !eanValueEl || - !(eanValueEl instanceof dom.window.HTMLElement) || - !eanValueEl.dataset.specification - ) - throw new Error("No encontré el EAN"); - return eanValueEl.dataset.specification; -} - -function parseJsonLds(dom: Window): object[] { - const scripts = dom.window.document.querySelectorAll( - 'script[type="application/ld+json"]' - ); - return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML)); -} -function findJsonLd(dom: Window, type: string): object | undefined { - return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type); -} - -function parseScriptJson(dom: Window, varname: string): T { - const script = dom.window.document.querySelector( - `template[data-type="json"][data-varname="${varname}"]` - )?.content?.children[0]; - if (!script) throw new Error("no encuentro el script"); - return JSON.parse(script.innerHTML); -} - -function eanFromSeedState(dom: Window): string { - const json = parseScriptJson(dom, "__STATE__"); - const productJson = Object.entries(json).find( - ([key, val]) => key.startsWith("Product:") && val.__typename === "Product" - ); - if (!productJson) throw new Error("no encontré el product en el json"); - - const productSkuJson = Object.entries(json).find( - ([key, val]) => - key.startsWith(`Product:${productJson[1].cacheId}`) && - val.__typename === "SKU" - ); - if (!productSkuJson) throw new Error("no encontré el sku en el json"); - return productSkuJson[1].ean; -} - -function eanFromDynamicYieldScript(dom: Window): string { - const scriptEl = dom.window.document.querySelector( - `script[src^="//st.dynamicyield.com/st?"]` - ); - if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement)) - throw new Error("no encuentro el script de dynamicyield"); - - const url = new URL(scriptEl.src); - const ctx = url.searchParams.get("ctx"); - if (!ctx) throw new Error("no hay ctx"); - return JSON.parse(ctx).data[0]; -} - -function getCarrefourProduct(html: string | Buffer): Precio { - const dom = parseHTML(html); - - const precioMeta = dom.window.document - .querySelector(`meta[property="product:price:amount"]`) - ?.getAttribute("content"); - if (!precioMeta) throw new Error("No encontré el precio"); - const precioCentavos = parseFloat(precioMeta) * 100; - - // const productLd = findJsonLd(dom, "Product"); - const ean = eanFromSeedState(dom); - - return { - ean, - precioCentavos, - fetchedAt: new Date(), - }; -} - (async () => { - // await migrate(db, { migrationsFolder: "./drizzle" }); - // const p = await getCarrefourProduct( - // "https://www.carrefour.com.ar/bebida-lactea-la-serenisima-ultra-0-grasa-vainilla-900-cc/p" - // ); - // await storePrecioPoint(p); + const o = createWriteStream("x.tsv"); + o.write(`ean\tfetchedAt\tprecioCentavos\tinStock\turl\n`); const warc = createReadStream(process.argv[2]); const parser = new WARCParser(warc); + let progress = { done: 0, errors: 0 }; for await (const record of parser) { if (record.warcType === "response") { + if (!record.warcTargetURI) throw new Error("no uri"); console.log(record.warcTargetURI); const html = await record.contentText(); + + const url = new URL(record.warcTargetURI); try { - const product = getCarrefourProduct(html); - console.log(product); + let ish: Precioish | undefined = undefined; + if (url.hostname === "www.carrefour.com.ar") + ish = getCarrefourProduct(html); + else if (url.hostname === "diaonline.supermercadosdia.com.ar") + ish = getDiaProduct(html); + else console.error(`Unknown host ${url.hostname}`); + + const p: Precio = { + ...ish, + fetchedAt: new Date(record.warcDate!), + url: record.warcTargetURI, + }; + + if (ish) + o.write( + `${p.ean}\t${p.fetchedAt}\t${p.precioCentavos}\t${p.inStock}\t${p.url}\n` + ); + + // console.log(product); + progress.done++; } catch (error) { console.error(error); - const urlHash = createHash("md5") - .update(record.warcTargetURI!) - .digest("hex"); - const output = `${urlHash}.html`; - await writeFile(output, html); - console.error(`wrote html to ${output}`); + progress.errors++; + + if (DEBUG) { + const urlHash = createHash("md5") + .update(record.warcTargetURI!) + .digest("hex"); + const output = join("debug", `${urlHash}.html`); + await writeFile(output, html); + console.error(`wrote html to ${output}`); + } + } finally { + console.debug(progress); } } } diff --git a/scraper/tsconfig.json b/scraper/tsconfig.json new file mode 100644 index 0000000..0e66022 --- /dev/null +++ b/scraper/tsconfig.json @@ -0,0 +1,6 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "types": ["bun-types"] + } +}