scrapear dia y carrefour via bunjs

This commit is contained in:
Cat /dev/Nulo 2023-12-22 15:46:22 -03:00
parent 5e55ad7131
commit d85b54f837
8 changed files with 227 additions and 102 deletions

.gitignore vendored
View file

@ -3,4 +3,6 @@ data/carrefour

View file

@ -43,7 +43,7 @@ importers:
version: 9.2.2
specifier: ^0.29.1
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19)
specifier: ^0.16.5
version: 0.16.5
@ -59,6 +59,9 @@ importers:
specifier: ^2.2.1
version: 2.2.1
specifier: ^3.22.4
version: 3.22.4
specifier: ^7.6.8
@ -66,6 +69,9 @@ importers:
specifier: ^20.10.5
version: 20.10.5
specifier: ^1.0.19
version: 1.0.19
specifier: ^4.7.0
version: 4.7.0
@ -274,6 +280,11 @@ packages:
undici-types: 5.26.5
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
'@types/node': 20.10.5
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
engines: {node: '>=8'}
@ -330,6 +341,13 @@ packages:
ieee754: 1.2.1
dev: false
resolution: {integrity: sha512-7P5/r+twssrkDQ6HMit2GARMBbAxz1tLLEcMgQOCZeCX9BzNtabktjPCu+DmcvDYDnL/Ke75pmKg9CNBTlCzlQ==}
'@types/node': 20.10.5
'@types/ws': 8.5.10
undici-types: 5.26.5
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
dev: false
@ -424,7 +442,7 @@ packages:
domhandler: 5.0.3
dev: false
resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==}
'@aws-sdk/client-rds-data': '>=3'
@ -488,6 +506,7 @@ packages:
'@types/better-sqlite3': 7.6.8
better-sqlite3: 9.2.2
bun-types: 1.0.19
dev: false
@ -955,3 +974,7 @@ packages:
y18n: 5.0.8
yargs-parser: 21.1.1
dev: false
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
dev: false

scraper/carrefour.ts Normal file
View file

@ -0,0 +1,70 @@
import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js";
function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector(
const eanValueEl = eanLabelEl?.parentElement?.children[1];
if (
!eanValueEl ||
!(eanValueEl instanceof dom.window.HTMLElement) ||
throw new Error("No encontré el EAN");
return eanValueEl.dataset.specification;
function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>(
if (!script) throw new Error("no encuentro el script");
return JSON.parse(script.innerHTML);
function eanFromSeedState(dom: Window): string {
const json = parseScriptJson<object>(dom, "__STATE__");
const productJson = Object.entries(json).find(
([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
if (!productJson) throw new Error("no encontré el product en el json");
const productSkuJson = Object.entries(json).find(
([key, val]) =>
key.startsWith(`Product:${productJson[1].cacheId}`) &&
val.__typename === "SKU"
if (!productSkuJson) throw new Error("no encontré el sku en el json");
return productSkuJson[1].ean;
function eanFromDynamicYieldScript(dom: Window): string {
const scriptEl = dom.window.document.querySelector(
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
throw new Error("no encuentro el script de dynamicyield");
const url = new URL(scriptEl.src);
const ctx = url.searchParams.get("ctx");
if (!ctx) throw new Error("no hay ctx");
return JSON.parse(ctx).data[0];
export function getCarrefourProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
const precioCentavos = priceFromMeta(dom);
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom);
const ld = getProductJsonLd(dom);
const inStock =
ld.offers.offers[0].availability === "";
return {

scraper/common.ts Normal file
View file

@ -0,0 +1,50 @@
import { z } from "zod";
export function getMetaProp(dom: Window, prop: string) {
return dom.window.document
/**
 * Reads the `product:price:amount` value via `getMetaProp` and converts it to
 * centavos.
 *
 * The amount is parsed with `parseFloat`, multiplied by 100 and rounded to the
 * nearest integer: binary floating point makes products such as `1.15 * 100`
 * come out as `114.99999999999999`, which is not a valid centavo count.
 *
 * @param dom - Parsed document, passed through to `getMetaProp`.
 * @returns The price in whole centavos, or `null` when the value is absent or
 *   is not a finite number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  const precio = parseFloat(precioMeta);
  // An unparsable amount is treated like a missing one instead of leaking NaN to callers.
  if (!Number.isFinite(precio)) return null;
  return Math.round(precio * 100);
}
function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll(
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
/**
 * Returns the first value produced by `parseJsonLds` whose `@type` property
 * equals `type`.
 *
 * @param dom - Document handed to `parseJsonLds`.
 * @param type - Exact `@type` string to look for (e.g. `"Product"`).
 * @returns The first matching object, or `undefined` when nothing matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  return parseJsonLds(dom).find(
    (x) =>
      // JSON.parse can yield null or a primitive; the `in` operator throws a
      // TypeError on those, so rule them out before probing for "@type".
      typeof x === "object" && x !== null && "@type" in x && x["@type"] === type
  );
}
const zProductLd = z.object({
"@type": z.literal("Product"),
name: z.string(),
image: z.string(),
offers: z.object({
offers: z.tuple([
"@type": z.literal("Offer"),
price: z.number(),
priceCurrency: z.literal("ARS"),
availability: z.enum([
type ProductLd = z.infer<typeof zProductLd>;
/**
 * Looks up the `Product` JSON-LD entry with `findJsonLd` and validates it
 * against `zProductLd`.
 *
 * @param dom - Document handed to `findJsonLd`.
 * @returns The entry as parsed by `zProductLd`.
 * @throws Whatever `zProductLd.parse` throws when the entry is missing or does
 *   not fit the schema.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}

scraper/dia.ts Normal file
View file

@ -0,0 +1,21 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
const ean = getMetaProp(dom, "product:retailer_item_id");
if (!ean) throw new Error("No encontré el ean");
const precioCentavos = priceFromMeta(dom);
const ld = getProductJsonLd(dom);
const inStock =
ld.offers.offers[0].availability === "";
return {

View file

@ -17,11 +17,13 @@
"nanoid": "^5.0.4",
"p-map": "^7.0.0",
"undici": "^6.2.0",
"warcio": "^2.2.1"
"warcio": "^2.2.1",
"zod": "^3.22.4"
"devDependencies": {
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.5",
"bun-types": "^1.0.19",
"tsx": "^4.7.0",
"typescript": "^5.3.3"

View file

@ -1,128 +1,79 @@
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
/// <reference types="node" />
import { parseHTML } from "linkedom";
import { drizzle } from "drizzle-orm/better-sqlite3";
import Database from "better-sqlite3";
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { precios } from "./db/schema.js";
import { WARCParser } from "warcio";
import { createReadStream } from "fs";
import { createReadStream, createWriteStream } from "fs";
import { writeFile } from "fs/promises";
import { createHash } from "crypto";
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./dia.js";
import { join } from "path";
const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite);
type Precio = typeof precios.$inferInsert;
const DEBUG = true;
export type Precio = typeof precios.$inferInsert;
export type Precioish = Omit<Precio, "fetchedAt" | "url">;
/**
 * Inserts one price record into `precios` through the module-level `db` handle
 * and waits for the insert to finish.
 *
 * @param point - Row to insert; `Precio` is the insert type inferred from `precios`.
 */
async function storePrecioPoint(point: Precio) {
await db.insert(precios).values(point);
function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector(
const eanValueEl = eanLabelEl?.parentElement?.children[1];
if (
!eanValueEl ||
!(eanValueEl instanceof dom.window.HTMLElement) ||
throw new Error("No encontré el EAN");
return eanValueEl.dataset.specification;
function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll(
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
function findJsonLd(dom: Window, type: string): object | undefined {
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>(
if (!script) throw new Error("no encuentro el script");
return JSON.parse(script.innerHTML);
function eanFromSeedState(dom: Window): string {
const json = parseScriptJson<object>(dom, "__STATE__");
const productJson = Object.entries(json).find(
([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
if (!productJson) throw new Error("no encontré el product en el json");
const productSkuJson = Object.entries(json).find(
([key, val]) =>
key.startsWith(`Product:${productJson[1].cacheId}`) &&
val.__typename === "SKU"
if (!productSkuJson) throw new Error("no encontré el sku en el json");
return productSkuJson[1].ean;
function eanFromDynamicYieldScript(dom: Window): string {
const scriptEl = dom.window.document.querySelector(
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
throw new Error("no encuentro el script de dynamicyield");
const url = new URL(scriptEl.src);
const ctx = url.searchParams.get("ctx");
if (!ctx) throw new Error("no hay ctx");
return JSON.parse(ctx).data[0];
function getCarrefourProduct(html: string | Buffer): Precio {
const dom = parseHTML(html);
const precioMeta = dom.window.document
if (!precioMeta) throw new Error("No encontré el precio");
const precioCentavos = parseFloat(precioMeta) * 100;
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom);
return {
fetchedAt: new Date(),
(async () => {
// await migrate(db, { migrationsFolder: "./drizzle" });
// const p = await getCarrefourProduct(
// ""
// );
// await storePrecioPoint(p);
const o = createWriteStream("x.tsv");
const warc = createReadStream(process.argv[2]);
const parser = new WARCParser(warc);
let progress = { done: 0, errors: 0 };
for await (const record of parser) {
if (record.warcType === "response") {
if (!record.warcTargetURI) throw new Error("no uri");
const html = await record.contentText();
const url = new URL(record.warcTargetURI);
try {
const product = getCarrefourProduct(html);
let ish: Precioish | undefined = undefined;
if (url.hostname === "")
ish = getCarrefourProduct(html);
else if (url.hostname === "")
ish = getDiaProduct(html);
else console.error(`Unknown host ${url.hostname}`);
const p: Precio = {
fetchedAt: new Date(record.warcDate!),
url: record.warcTargetURI,
if (ish)
// console.log(product);
} catch (error) {
const urlHash = createHash("md5")
const output = `${urlHash}.html`;
await writeFile(output, html);
console.error(`wrote html to ${output}`);
if (DEBUG) {
const urlHash = createHash("md5")
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);
console.error(`wrote html to ${output}`);
} finally {

scraper/tsconfig.json Normal file
View file

@ -0,0 +1,6 @@
"extends": "../tsconfig.json",
"compilerOptions": {
"types": ["bun-types"]