2024-11-26 19:46:19 +00:00
22 changed files with 2132 additions and 26 deletions
--- a/.github/workflows/container.yml
+++ b/.github/workflows/container.yml
@ -88,32 +88,12 @@ jobs:
        uses: actions/cache@v3
        with:
          path: usr/src/app/target
-          key: usr/src/app/target-${{ hashFiles('Dockerfile.scraper') }}
+          key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
      - name: inject usr/src/app/target into docker
        uses: reproducible-containers/buildkit-cache-dance@v2.1.3
        with:
          cache-source: usr/src/app/target
          cache-target: /usr/src/app/target
      - name: Cache root/.cargo/registry
        uses: actions/cache@v3
        with:
          path: root/.cargo/registry
          key: root/.cargo/registry-${{ hashFiles('Dockerfile.scraper') }}
      - name: inject root/.cargo/registry into docker
        uses: reproducible-containers/buildkit-cache-dance@v2.1.3
        with:
          cache-source: root/.cargo/registry
          cache-target: /root/.cargo/registry
      - name: Cache root/.cargo/git
        uses: actions/cache@v3
        with:
          path: root/.cargo/git
          key: root/.cargo/git-${{ hashFiles('Dockerfile.scraper') }}
      - name: inject root/.cargo/git into docker
        uses: reproducible-containers/buildkit-cache-dance@v2.1.3
        with:
          cache-source: root/.cargo/git
          cache-target: /root/.cargo/git
      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
--- a/db-datos/drizzle.config.js
+++ b/db-datos/drizzle.config.js
@ -1,4 +1,4 @@
-export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
+export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
 /** @type { import("drizzle-kit").Config } */
 export default {
--- a/link-scrapers/carrefour.ts
+++ b/link-scrapers/carrefour.ts
@ -0,0 +1,33 @@
 import pMap from "p-map";
 import { saveUrls } from "db-datos/urlHelpers.js";
 import { getUrlsFromSitemap } from "./common.js";
 /** Collects Carrefour product URLs; the only strategy it runs is the sitemap walk in scrapBySitemap. */
 export async function scrapCarrefourProducts() {
  await scrapBySitemap();
 }
 /**
  * Downloads every Carrefour product sitemap and stores the URLs listed in it.
  *
  * At most 3 sitemaps are in flight at once. A sitemap answering with a
  * non-2xx status is logged and skipped instead of being parsed.
  */
 async function scrapBySitemap() {
  // taken from https://www.carrefour.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // an error page is not a sitemap, so there is nothing worth saving
        console.warn(`skipped ${sitemapUrl} because status=${res.status}`);
        return;
      }
      const xml = await res.text();
      // awaited so the concurrency limit also covers the save step and a
      // failing save rejects this call (harmless if saveUrls is synchronous)
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
 }
--- a/link-scrapers/common.ts
+++ b/link-scrapers/common.ts
@ -0,0 +1,14 @@
 import { decodeXML } from "entities";
 /**
  * Extracts the URLs listed in the `<loc>` elements of a sitemap.
  *
  * @param xml Sitemap document as text.
  * @returns The distinct URLs, XML entities decoded, in first-seen order.
  */
 export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // HTMLRewriter may hand over one text node in several chunks, so the pieces
  // are buffered until the chunk flagged as the last one of the node arrives.
  let pending = "";
  new HTMLRewriter()
    .on("loc", {
      text(element) {
        pending += element.text;
        if (!element.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        urls.add(decodeXML(txt));
      },
    })
    // NOTE(review): relies on transform() running the handlers before it
    // returns, as the original code did — confirm for the runtime in use
    .transform(new Response(xml));
  return Array.from(urls);
 }
--- a/link-scrapers/coto.ts
+++ b/link-scrapers/coto.ts
@ -0,0 +1,44 @@
 import { parseHTML } from "linkedom";
 import PQueue from "p-queue";
 import { saveUrls } from "db-datos/urlHelpers.js";
 /**
  * Walks the Coto product listing page by page and saves the product links.
  *
  * The `No` (offset) and `Nrpp` (page size) query parameters of the base URL
  * are overwritten for every page; up to 4 pages are downloaded at a time.
  */
 export async function scrapCotoProducts() {
  const baseUrl =
    "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
  const pageSize = 300; // up to 1000
  const pageCount = Math.ceil(29000 / pageSize);
  const queue = new PQueue({ concurrency: 4 });
  const links = Array.from({ length: pageCount }, (_, page) => {
    const pageUrl = new URL(baseUrl);
    pageUrl.searchParams.set("No", `${page * pageSize}`);
    pageUrl.searchParams.set("Nrpp", `${pageSize}`);
    return pageUrl.toString();
  });
  await Promise.all(links.map((link) => queue.add(getPage(link))));
 }
 /**
  * Builds a task that downloads one Coto listing page and saves the product
  * links found in it.
  *
  * A failed download is retried a bounded number of times with a growing pause
  * in between; when every attempt fails the page is logged and skipped so the
  * remaining pages still get processed.
  *
  * @param url Listing page to download.
  * @param maxAttempts How many times to try the download before giving up.
  * @returns An async function to be scheduled on the queue.
  */
 function getPage(url: string, maxAttempts = 5) {
  return async () => {
    let html: string | undefined;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        const res = await fetch(url);
        html = await res.text();
        break;
      } catch (error) {
        console.warn(
          `[coto] attempt ${attempt}/${maxAttempts} failed for ${url}: ${error}`
        );
        // back off a little before retrying instead of hammering the site
        if (attempt < maxAttempts)
          await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
      }
    }
    if (html === undefined) {
      console.error(`[coto] giving up on ${url}`);
      return;
    }
    const { document } = parseHTML(html);
    const hrefs = Array.from(
      document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
      // hrefs may be relative, so resolve them against the page URL
      (a) => new URL(a.href, url).toString()
    );
    await saveUrls(hrefs);
  };
 }
--- a/link-scrapers/dia.ts
+++ b/link-scrapers/dia.ts
@ -0,0 +1,124 @@
 import pMap from "p-map";
 import { parseHTML } from "linkedom";
 import { saveUrls } from "db-datos/urlHelpers.js";
 import { getUrlsFromSitemap } from "./common.js";
 // Listing pages of the Día online store (category paths plus product-cluster
 // pages). Only scrapBySite reads this list, adding a `page` query parameter.
 const categorias = [
  "https://diaonline.supermercadosdia.com.ar/almacen",
  "https://diaonline.supermercadosdia.com.ar/almacen/conservas",
  "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
  "https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
  "https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
  "https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
  "https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
  "https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
  "https://diaonline.supermercadosdia.com.ar/almacen/harinas",
  "https://diaonline.supermercadosdia.com.ar/almacen/picadas",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
  "https://diaonline.supermercadosdia.com.ar/desayuno",
  "https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
  "https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
  "https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
  "https://diaonline.supermercadosdia.com.ar/frescos",
  "https://diaonline.supermercadosdia.com.ar/frescos/leches",
  "https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
  "https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
  "https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
  "https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
  "https://diaonline.supermercadosdia.com.ar/bebidas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
  "https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
  "https://diaonline.supermercadosdia.com.ar/congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
  "https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
  "https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
  "https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/hielo",
  "https://diaonline.supermercadosdia.com.ar/limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
  "https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
  "https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
  "https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
  "https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
 ];
 /** Collects Día product URLs; resolves once the sitemap walk has finished. */
 export async function scrapDiaProducts() {
  // scrapBySite() (crawling the category pages) is switched off for now,
  // so the sitemap walk is the only strategy that runs
  await scrapBySitemap();
 }
 /**
  * Downloads every Día product sitemap and stores the URLs listed in it.
  *
  * At most 3 sitemaps are in flight at once. A sitemap answering with a
  * non-2xx status is logged and skipped instead of being parsed.
  */
 async function scrapBySitemap() {
  // taken from https://diaonline.supermercadosdia.com.ar/sitemap.xml
  const sitemaps = [
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // an error page is not a sitemap, so there is nothing worth saving
        console.warn(`skipped ${sitemapUrl} because status=${res.status}`);
        return;
      }
      const xml = await res.text();
      // awaited so the concurrency limit also covers the save step and a
      // failing save rejects this call (harmless if saveUrls is synchronous)
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
 }
 /**
  * Crawls pages 0 to 50 of every listing in `categorias` and stores the
  * product links found on each page. Up to 32 pages are fetched at a time;
  * pages answering with a non-2xx status are logged and skipped.
  */
 async function scrapBySite() {
  const links = categorias.flatMap((link) =>
    Array.from({ length: 51 }, (_, page) => {
      const url = new URL(link);
      url.searchParams.set("page", `${page}`);
      return url.toString();
    })
  );
  await pMap(
    links,
    async (url) => {
      // NOTE(review): `timeout` is not a standard fetch option; presumably a
      // runtime-specific flag that disables the request timeout — confirm
      const res = await fetch(url, { timeout: false });
      if (!res.ok) {
        console.warn(`skipped ${url} because status=${res.status}`);
        return;
      }
      const html = await res.text();
      const { document } = parseHTML(html);
      const hrefs = Array.from(
        document.querySelectorAll<HTMLAnchorElement>(
          "a.vtex-product-summary-2-x-clearLink"
        ),
        // hrefs may be relative, so resolve them against the page URL
        (a) => new URL(a.href, url).toString()
      );
      // awaited so a failing save rejects this call instead of being dropped
      await saveUrls(hrefs);
    },
    { concurrency: 32 }
  );
 }
--- a/link-scrapers/jumbo.ts
+++ b/link-scrapers/jumbo.ts
@ -0,0 +1,38 @@
 import pMap from "p-map";
 import { saveUrls } from "db-datos/urlHelpers.js";
 import { getUrlsFromSitemap } from "./common.js";
 /** Collects Jumbo product URLs; the only strategy it runs is the sitemap walk in scrapBySitemap. */
 export async function scrapJumboProducts() {
  await scrapBySitemap();
 }
 /**
  * Downloads every Jumbo product sitemap and stores the URLs listed in it.
  *
  * At most 3 sitemaps are in flight at once. A sitemap answering with a
  * non-2xx status is logged and skipped instead of being parsed.
  */
 async function scrapBySitemap() {
  // taken from https://www.jumbo.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // an error page is not a sitemap, so there is nothing worth saving
        console.warn(`skipped ${sitemapUrl} because status=${res.status}`);
        return;
      }
      const xml = await res.text();
      // awaited so the concurrency limit also covers the save step and a
      // failing save rejects this call (harmless if saveUrls is synchronous)
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
 }
--- a/link-scrapers/package.json
+++ b/link-scrapers/package.json
@ -0,0 +1,18 @@
 {
  "name": "link-scrapers",
  "type": "module",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "entities": "^4.5.0",
    "linkedom": "^0.16.5",
    "p-queue": "^8.0.1"
  }
 }
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@ -1,3 +1,5 @@
 packages:
  - link-scrapers
  - scraper
  - sitio
  - db-datos
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@ -111,7 +111,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
 }
 fn connect_db() -> Pool {
-    let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
+    let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
    let cfg = deadpool_sqlite::Config::new(db_path);
    let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
    pool
--- a/scraper/auto.ts
+++ b/scraper/auto.ts
@ -0,0 +1,137 @@
 import { mkdtemp, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
 import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
 import PQueue from "p-queue";
 import { formatDuration, intervalToDuration } from "date-fns";
 import { db } from "db-datos/db.js";
 import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapCotoProducts } from "../link-scrapers/coto.js";
 import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
 import { readableStreamToText } from "bun";
 // scraping goes through a queue so that there are never several writers to
 // the DB at once and the CPU is not overloaded
 const scrapQueue = new PQueue({ concurrency: 1 });
 /**
  * Runs one scraping pass: starts `downloadList` for every known supermarket
  * on a single `Auto` instance and waits for all of them.
  */
 export async function auto() {
  const runner = new Auto();
  const downloads = supermercados.map((supermercado) =>
    runner.downloadList(supermercado)
  );
  await Promise.all(downloads);
 }
 /**
  * Drives one scraping pass: collects product URLs per supermarket, dumps them
  * into a list file and hands that file to the `scraper-rs` binary, reporting
  * progress to the console and, when configured, to a Telegram chat.
  */
 class Auto {
  /** Telegram credentials; stays undefined when either env var is missing. */
  telegramConfig?: { token: string; chatId: string };
  constructor() {
    if (!process.env.TELEGRAM_BOT_TOKEN)
      console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
    else if (!process.env.TELEGRAM_BOT_CHAT_ID)
      console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
    else
      this.telegramConfig = {
        token: process.env.TELEGRAM_BOT_TOKEN,
        chatId: process.env.TELEGRAM_BOT_CHAT_ID,
      };
    this.inform("[auto] Empezando scrap");
  }
  /** Runs the link scraper of one supermarket and reports how long it took. */
  async scrapUrls(supermercado: Supermercado) {
    const t0 = performance.now();
    switch (supermercado) {
      case "Dia":
        await scrapDiaProducts();
        break;
      case "Coto":
        await scrapCotoProducts();
        break;
      case "Carrefour":
        await scrapCarrefourProducts();
        break;
      case "Jumbo":
        await scrapJumboProducts();
        break;
    }
    this.inform(
      `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
    );
  }
  /**
   * Refreshes the product URLs of one supermarket, writes the stored URLs
   * matching its host into a temporary list file and scrapes that list.
   * Resolves once the scrape of the list has been reported.
   */
  async downloadList(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
    await scrapQueue.add(async () => {
      await this.scrapUrls(supermercado);
    });
    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
    const hostEntry = Object.entries(hosts).find(
      ([, supe]) => supe === supermercado
    );
    // fail with a readable message instead of a TypeError on `undefined[0]`
    if (!hostEntry) throw this.report(`No conozco el host de ${supermercado}`);
    const [host] = hostEntry;
    const results = await db.query.productoUrls
      .findMany({
        where: like(productoUrls.url, `%${host}%`),
      })
      .execute();
    const urls = results.map((r) => r.url);
    await writeFile(listPath, urls.join("\n") + "\n");
    // awaited so the caller only resolves when the list was actually processed
    await this.scrapAndInform({ listPath });
    // TODO: delete the temporary files
  }
  /**
   * Runs `scraper-rs fetch-list` on the given list through the shared queue
   * and reports its output and duration. Failures are reported, not thrown.
   */
  async scrapAndInform({ listPath }: { listPath: string }) {
    let res;
    try {
      res = await scrapQueue.add(async () => {
        const t0 = performance.now();
        const sub = Bun.spawn({
          cmd: ["scraper-rs", "fetch-list", listPath],
          stdio: ["ignore", "pipe", "inherit"],
        });
        const text = await readableStreamToText(sub.stdout);
        const code = await sub.exited;
        if (code !== 0) throw new Error(`scraper-rs threw ${code}`);
        return { took: performance.now() - t0, text };
      });
    } catch (error) {
      // a rejected queue task would otherwise escape as an unhandled rejection
      this.inform(`Algo falló en ${listPath}: ${error}`);
      return;
    }
    if (res) {
      const { took, text } = res;
      this.inform(`Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`);
    } else {
      this.inform(`Algo falló en ${listPath}`);
    }
  }
  /** Logs a message to the console and, best effort, to Telegram. */
  inform(msg: string) {
    // fire-and-forget on purpose, but a failed send must not go unhandled
    this.sendTelegramMsg(msg).catch((error) =>
      console.error(`no pude mandar el mensaje a Telegram: ${error}`)
    );
    console.info(msg);
  }
  /** Informs the message and returns an Error carrying it, for `throw`. */
  report(msg: string) {
    this.inform(msg);
    const error = new Error(msg);
    return error;
  }
  /** Sends a text through the Telegram Bot API; no-op without credentials. */
  async sendTelegramMsg(text: string) {
    if (!this.telegramConfig) return;
    const url = new URL(
      `https://api.telegram.org/bot${this.telegramConfig.token}/sendMessage`
    );
    url.searchParams.set("chat_id", this.telegramConfig.chatId);
    url.searchParams.set("text", text);
    await fetch(url);
  }
 }
 /** Renders a millisecond count as a human-readable duration via date-fns. */
 function formatMs(ms: number) {
  const elapsed = intervalToDuration({ start: 0, end: Math.round(ms) });
  return formatDuration(elapsed);
 }
--- a/scraper/cli.ts
+++ b/scraper/cli.ts
@ -0,0 +1,44 @@
 import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 import { scrapCotoProducts } from "../link-scrapers/coto.js";
 import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
 import { auto } from "./auto.js";
 import { downloadList, getProduct } from "./scrap.js";
 import Cron from "croner";
 // Command-line entry point: process.argv[2] selects the action to run.
 switch (process.argv[2]) {
  case "auto":
    await auto();
    break;
  case "cron":
    // schedules a full pass with the pattern "0 2 * * *" (02:00 every day)
    Cron("0 2 * * *", () => {
      // a failed run is logged instead of surfacing as an unhandled rejection
      auto().catch((error) => console.error("[cron] auto falló:", error));
    });
    break;
  case "scrap-carrefour-links":
    await scrapCarrefourProducts();
    break;
  case "scrap-dia-links":
    await scrapDiaProducts();
    break;
  case "scrap-coto-links":
    await scrapCotoProducts();
    break;
  case "scrap-jumbo-links":
    await scrapJumboProducts();
    break;
  case "scrap-link": {
    // fetches one product page and prints what the parser extracts from it
    const rawUrl = process.argv[3];
    if (!rawUrl) {
      console.error("Especificá una URL para scrapear.");
      process.exit(1);
    }
    const url = new URL(rawUrl);
    const res = await fetch(url);
    const text = await res.text();
    console.info(await getProduct(url, text));
    break;
  }
  case "scrap": {
    // every remaining argument is the path of a URL list file
    const urlLists = process.argv.slice(3);
    if (urlLists.length > 0) {
      for (const path of urlLists) {
        const res = await downloadList(path);
        console.info("=======================================");
        console.info(path, res);
        console.info("=======================================");
      }
    } else {
      console.error("Especificá listas de urls para scrapear.");
      process.exit(1);
    }
    break;
  }
  default:
    console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
    process.exit(1);
 }
--- a/scraper/package.json
+++ b/scraper/package.json
@ -0,0 +1,29 @@
 {
  "name": "scraper",
  "type": "module",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "check": "tsc"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@aws-sdk/client-s3": "^3.478.0",
    "@aws-sdk/lib-storage": "^3.478.0",
    "croner": "^8.0.0",
    "date-fns": "^3.0.6",
    "db-datos": "workspace:^",
    "drizzle-orm": "^0.29.1",
    "linkedom": "^0.16.5",
    "nanoid": "^5.0.4",
    "p-map": "^7.0.1",
    "p-queue": "^8.0.1",
    "zod": "^3.22.4"
  },
  "devDependencies": {
    "typescript": "^5.3.3"
  }
 }
--- a/scraper/parsers/carrefour.ts
+++ b/scraper/parsers/carrefour.ts
@ -0,0 +1,56 @@
 import { parseHTML } from "linkedom";
 import { Precioish } from "../scrap.js";
 import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
 /**
  * Parses the JSON held by the first child of the
  * `<template data-type="json" data-varname="…">` element with the given name.
  *
  * @throws Error when no such template (or child) exists.
  */
 function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const script = template?.content?.children[0];
  if (script) return JSON.parse(script.innerHTML);
  throw new Error("no encuentro el script");
 }
 /**
  * Reads the EAN from the `__STATE__` JSON embedded in the page: finds the
  * entry typed "Product", then the "SKU" entry whose key starts with that
  * product's cache id.
  *
  * @throws Error when the product or its SKU entry is missing.
  */
 function eanFromSeedState(dom: Window): string {
  const entries = Object.entries(parseScriptJson<object>(dom, "__STATE__"));
  const product = entries.find(
    ([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
  );
  if (!product) throw new Error("no encontré el product en el json");
  const skuKeyPrefix = `Product:${product[1].cacheId}`;
  const sku = entries.find(
    ([key, val]) => key.startsWith(skuKeyPrefix) && val.__typename === "SKU"
  );
  if (!sku) throw new Error("no encontré el sku en el json");
  return sku[1].ean;
 }
 /**
  * Extracts price, stock, EAN, name and image from a Carrefour product page.
  *
  * Name and image come from the JSON-LD block; when that block cannot be read
  * the error is rethrown only for in-stock products, because some
  * out-of-stock pages carry no JSON-LD.
  */
 export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const ean = eanFromSeedState(dom);
  let ld: ReturnType<typeof getProductJsonLd> | undefined;
  try {
    ld = getProductJsonLd(dom);
  } catch (error) {
    // out-of-stock pages may legitimately lack JSON-LD; name/image stay unset
    if (inStock) throw error;
  }
  return {
    name: ld?.name,
    imageUrl: ld?.image,
    ean,
    precioCentavos,
    inStock,
  };
 }
--- a/scraper/parsers/common.ts
+++ b/scraper/parsers/common.ts
@ -0,0 +1,55 @@
 import { z } from "zod";
 /**
  * Reads the `content` attribute of the `<meta property="…">` tag with the
  * given property name.
  *
  * @returns The attribute value, or null/undefined when the tag or the
  *   attribute is missing.
  */
 export function getMetaProp(dom: Window, prop: string) {
  return dom.window.document
    .querySelector(`meta[property="${prop}"]`)
    ?.getAttribute("content");
 }
 /**
  * Reads the price from the `product:price:amount` meta tag.
  *
  * @returns The price in whole cents, or null when the tag is missing, empty
  *   or not a number.
  */
 export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  const precio = parseFloat(precioMeta);
  if (Number.isNaN(precio)) return null;
  // rounding keeps cents integral: 19.99 * 100 is 1998.9999999999998 in floats
  return Math.round(precio * 100);
 }
 /** True when the `product:availability` meta tag says "instock". */
 export function stockFromMeta(dom: Window) {
  return getMetaProp(dom, "product:availability") === "instock";
 }
 /**
  * Parses every `<script type="application/ld+json">` block of the page.
  *
  * Blocks holding invalid JSON, or JSON that is not an object (null, strings,
  * numbers), are skipped so one broken block cannot hide the valid ones.
  */
 function parseJsonLds(dom: Window): object[] {
  const scripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );
  const lds: object[] = [];
  for (const script of Array.from(scripts)) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(script.innerHTML);
    } catch {
      continue;
    }
    if (typeof parsed === "object" && parsed !== null) lds.push(parsed);
  }
  return lds;
 }
 /** Returns the first JSON-LD object whose `@type` equals `type`, if any. */
 function findJsonLd(dom: Window, type: string): object | undefined {
  return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
 }
 // Shape expected from a page's JSON-LD "Product" object; only the fields
 // listed here are validated.
 const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  sku: z.string().optional(),
  // the individual offers sit in a nested `offers.offers` array
  offers: z.object({
    offers: z.array(
      z.object({
        "@type": z.literal("Offer"),
        price: z.number(),
        priceCurrency: z.literal("ARS"),
        availability: z.enum([
          "http://schema.org/OutOfStock",
          "http://schema.org/InStock",
        ]),
      })
    ),
  }),
 });
 // Static type of a validated product JSON-LD object.
 type ProductLd = z.infer<typeof zProductLd>;
 /**
  * Finds the page's JSON-LD "Product" object and validates it.
  *
  * @throws When no such object exists or it does not match `zProductLd`.
  */
 export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
 }
--- a/scraper/parsers/coto.ts
+++ b/scraper/parsers/coto.ts
@ -0,0 +1,48 @@
 import { parseHTML } from "linkedom";
 import { type Precioish } from "../scrap.js";
 /**
  * Reads the EAN from the `div#brandText` element whose text contains
  * "| EAN: ", taking the second `span.span_codigoplu` inside it.
  *
  * @throws Error when that element or a non-empty EAN cannot be found.
  */
 function getEanFromText({ document }: Window) {
  const brandBlocks = Array.from(document.querySelectorAll("div#brandText"));
  const eanParent = brandBlocks.find((block) =>
    (block.textContent ?? "").includes("| EAN: ")
  );
  if (eanParent === undefined) throw new Error("no encuentro el eanparent");
  const [, eanEl] = Array.from(
    eanParent.querySelectorAll("span.span_codigoplu")
  );
  const ean = eanEl?.textContent?.trim();
  if (ean) return ean;
  throw new Error("no encuentro el ean");
 }
 /**
  * Reads the price shown in `.atg_store_newPrice`, written with "." between
  * thousands and "," before the decimals (e.g. "$1.234,50").
  *
  * @returns The price in whole cents, or null when the element is missing,
  *   empty or does not hold a number.
  */
 function getPriceFromText({ document }: Window) {
  const el = document.querySelector(".atg_store_newPrice");
  if (!el?.textContent) return null;
  const nStr = el.textContent
    .trim()
    .replace("$", "")
    .replace(/\./g, "")
    .replace(",", ".");
  const precio = parseFloat(nStr);
  if (Number.isNaN(precio)) return null;
  // rounding keeps cents integral: 19.99 * 100 is 1998.9999999999998 in floats
  return Math.round(precio * 100);
 }
 /** In stock means the page has no `.product_not_available` element. */
 function getInStock({ document }: Window) {
  const unavailableMarker = document.querySelector(".product_not_available");
  return !unavailableMarker;
 }
 /**
  * Extracts EAN, price, stock, name and image URL from a Coto product page.
  *
  * @throws Error when the EAN cannot be located (see getEanFromText).
  */
 export function getCotoProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const ean = getEanFromText(dom);
  const precioCentavos = getPriceFromText(dom);
  const inStock = getInStock(dom);
  const titleEl = dom.document.querySelector("h1.product_page");
  const imageEl = dom.document.querySelector<HTMLImageElement>(".zoom img");
  return {
    name: titleEl?.textContent?.trim(),
    imageUrl: imageEl?.src,
    ean,
    precioCentavos,
    inStock,
  };
 }
--- a/scraper/parsers/dia.ts
+++ b/scraper/parsers/dia.ts
@ -0,0 +1,25 @@
 import { parseHTML } from "linkedom";
 import { type Precioish } from "../scrap.js";
 import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
 /**
  * Extracts EAN, price, name, image and stock from a Día product page.
  *
  * The EAN comes from the `product:retailer_item_id` meta tag, the price from
  * the price meta tag, and name, image and availability from the JSON-LD.
  *
  * @throws Error when the EAN meta tag is missing; also when the product
  *   JSON-LD is missing or invalid.
  */
 export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) throw new Error("No encontré el ean");
  const precioCentavos = priceFromMeta(dom);
  const ld = getProductJsonLd(dom);
  const name = ld.name;
  const imageUrl = ld.image;
  // the schema accepts an empty offers list; "no offers" counts as out of
  // stock instead of crashing on `undefined.availability`
  const inStock =
    ld.offers.offers[0]?.availability === "http://schema.org/InStock";
  return {
    name,
    imageUrl,
    ean,
    precioCentavos,
    inStock,
  };
 }
--- a/scraper/parsers/jumbo.ts
+++ b/scraper/parsers/jumbo.ts
@ -0,0 +1,54 @@
 import { parseHTML } from "linkedom";
 import { type Precioish } from "../scrap.js";
 import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
 import { z } from "zod";
 // Shape expected from the Jumbo product search used by getEanFromSearch:
 // a one-element array whose entry lists items, each carrying an `ean`.
 const zJumboSearch = z.tuple([
  z.object({
    items: z.array(
      z.object({
        ean: z.string(),
      })
    ),
  }),
 ]);
 /**
  * Looks up the EAN of a Jumbo SKU through the store's product search API.
  *
  * @param sku Retailer SKU id, sent as the `skuId` filter.
  * @returns The EAN shared by every item of the matching product.
  * @throws Error when the request fails, the response does not match
  *   `zJumboSearch`, no items come back, or the items disagree on the EAN.
  */
 async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);
  const res = await fetch(url);
  if (!res.ok)
    throw new Error(
      `La búsqueda de Jumbo respondió status=${res.status} para el sku ${sku}`
    );
  const json = await res.json();
  const parsed = zJumboSearch.parse(json);
  const items = parsed[0].items;
  const first = items[0];
  // an empty list passes the schema, so fail with a readable message here
  if (!first)
    throw new Error(`La búsqueda de Jumbo no devolvió items para el sku ${sku}`);
  const ean = first.ean;
  if (!items.every((x) => x.ean === ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return ean;
 }
 /**
  * Extracts price, stock, name and image from a Jumbo product page and
  * resolves the EAN with an extra search request keyed by the JSON-LD SKU.
  *
  * @throws Error when the JSON-LD has no SKU; also when the JSON-LD or the
  *   EAN lookup fails.
  */
 export async function getJumboProduct(
  html: string | Buffer
 ): Promise<Precioish> {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const { name, image: imageUrl, sku: retailerSku } = getProductJsonLd(dom);
  if (!retailerSku)
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  const ean = await getEanFromSearch(retailerSku);
  return { name, imageUrl, ean, precioCentavos, inStock };
 }
--- a/scraper/scrap.ts
+++ b/scraper/scrap.ts
@ -0,0 +1,127 @@
 /// <reference lib="dom" />
 import * as schema from "db-datos/schema.js";
 import { writeFile, mkdir } from "fs/promises";
 import { createHash } from "crypto";
 import { getCarrefourProduct } from "./parsers/carrefour.js";
 import { getDiaProduct } from "./parsers/dia.js";
 import { getCotoProduct } from "./parsers/coto.js";
 import { join } from "path";
 import { db } from "db-datos/db.js";
 import pMap from "p-map";
 import { getJumboProduct } from "./parsers/jumbo.js";
 // When true, scrap() dumps the HTML of pages it failed to process into debug/.
 const DEBUG = true;
 // Stored with every inserted price row as `parserVersion`.
 const PARSER_VERSION = 4;
 /** Insert shape of a row of the `precios` table. */
 export type Precio = typeof schema.precios.$inferInsert;
 /** What a parser returns: a `Precio` minus the fields omitted below, which scrap() fills in or leaves unset. */
 export type Precioish = Omit<
  Precio,
  "fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
 >;
 /**
  * Scrapes every URL listed (one per line) in the file at `path`.
  *
  * Up to 32 URLs are processed at a time. A URL whose scrape ends in an error
  * is retried, waiting 1.5 s between tries, for at most 6 tries in total; the
  * last error is logged.
  *
  * @returns Counts of done and skipped URLs plus the list of final errors.
  */
 export async function downloadList(path: string) {
  const contents = await Bun.file(path).text();
  const urls = contents.split("\n").filter((line) => line.length > 0);
  const scrapWithRetries = async (urlS: string): Promise<ScrapResult> => {
    let res: ScrapResult = { type: "skipped" };
    let attempts = 0;
    while (attempts < 6) {
      if (attempts > 0) await wait(1500);
      res = await scrap(urlS);
      // only errors are worth another try
      if (res.type !== "error") break;
      attempts++;
    }
    if (res.type === "error") console.error(res);
    return res;
  };
  const results = await pMap(urls, scrapWithRetries, { concurrency: 32 });
  const progress: {
    done: number;
    skipped: number;
    errors: { error: any; url: string; debugPath: string }[];
  } = { done: 0, skipped: 0, errors: [] };
  for (const result of results) {
    if (result.type === "done") progress.done++;
    else if (result.type === "skipped") progress.skipped++;
    else progress.errors.push(result);
  }
  return progress;
 }
 /**
  * Picks the parser matching the URL's hostname and runs it on the page HTML.
  *
  * @throws Error for a hostname no parser is registered for.
  */
 export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
 }
 // Outcome of scraping one URL: "skipped" (bad URL or non-OK response),
 // "done" (price row inserted) or "error" (processing failed; `debugPath` is
 // where the page HTML gets dumped when DEBUG is on).
 type ScrapResult =
  | { type: "skipped" }
  | { type: "done" }
  | { type: "error"; url: string; error: any; debugPath: string };
 /**
  * Downloads one product page, parses it and inserts the resulting price row.
  *
  * @param urlS Product URL as text.
  * @returns "skipped" for an unparsable URL or a non-OK response, "done" after
  *   the insert, and "error" for any failure while downloading, parsing or
  *   inserting. `debugPath` names the HTML dump, which is only written when
  *   DEBUG is on and the page body was downloaded.
  */
 async function scrap(urlS: string): Promise<ScrapResult> {
  let url;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }
  let html: string | undefined;
  try {
    // the download sits inside the try so a network failure becomes an
    // "error" result the caller can retry, instead of rejecting the whole run
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
    let ish = await getProduct(url, html);
    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };
    await db.insert(schema.precios).values(p);
    return { type: "done" };
  } catch (error) {
    const urlHash = createHash("md5").update(urlS).digest("hex");
    const output = join("debug", `${urlHash}.html`);
    if (DEBUG && html !== undefined) {
      await mkdir("debug", { recursive: true });
      await writeFile(output, html);
    }
    return {
      type: "error",
      url: urlS,
      error,
      debugPath: output,
    };
  }
 }
 /** Returns a promise that resolves after `ms` milliseconds. */
 function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
 }
--- a/scraper/tsconfig.json
+++ b/scraper/tsconfig.json
@ -0,0 +1,4 @@
 {
  "extends": "../tsconfig.json",
  "exclude": ["../sitio"]
 }
--- a/sitio/src/routes/search/+page.server.ts
+++ b/sitio/src/routes/search/+page.server.ts
@ -11,7 +11,7 @@ export const load: PageServerLoad = async ({ url }) => {
      join precios p on p.ean = f.ean
      where f.name match ${`"${query}"`}
      group by p.ean
-      having max(p.fetched_at)
+      having max(p.fetched_at) and max(p.in_stock)
      order by p.in_stock desc;`;
    results = db.all(sqlQuery);
  }