carrefour url scraper

Cat /dev/Nulo 2023-12-29 22:58:37 -03:00
parent 2bac37df53
commit 73759ae6d9
5 changed files with 80 additions and 18 deletions

bun.lockb (binary file, not shown)

carrefour-link-scraper/index.ts

@@ -0,0 +1,44 @@
+import pMap from "p-map";
+import { saveUrls } from "db-datos/urlHelpers.js";
+
+// scrape when this file is run directly (`bun index.ts`),
+// not every time the module is imported
+if (import.meta.main) await scrapBySitemap();
+
+export async function scrapCarrefourProducts() {
+  await scrapBySitemap();
+}
+
+async function scrapBySitemap() {
+  // from https://www.carrefour.com.ar/sitemap.xml
+  const sitemaps = [
+    "https://www.carrefour.com.ar/sitemap/product-0.xml",
+    "https://www.carrefour.com.ar/sitemap/product-1.xml",
+    "https://www.carrefour.com.ar/sitemap/product-2.xml",
+    "https://www.carrefour.com.ar/sitemap/product-3.xml",
+    "https://www.carrefour.com.ar/sitemap/product-4.xml",
+    "https://www.carrefour.com.ar/sitemap/product-5.xml",
+    "https://www.carrefour.com.ar/sitemap/product-6.xml",
+    "https://www.carrefour.com.ar/sitemap/product-7.xml",
+    "https://www.carrefour.com.ar/sitemap/product-8.xml",
+    "https://www.carrefour.com.ar/sitemap/product-9.xml",
+  ];
+
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+
+      // HTMLRewriter is a Bun built-in; collect every <loc> entry
+      const urls = new Set<string>();
+      new HTMLRewriter()
+        .on("loc", {
+          text(element) {
+            const txt = element.text.trim();
+            if (!txt) return;
+            urls.add(txt);
+          },
+        })
+        .transform(new Response(xml));
+
+      saveUrls(Array.from(urls));
+    },
+    { concurrency: 3 }
+  );
+}
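One caveat about the <loc> handler above: HTMLRewriter streams its input, so a single text node can arrive split across several text() callbacks, and depending on the runtime the handlers may only fire as the transformed body is consumed. Below is a minimal sketch of a more defensive extractor, assuming Bun's HTMLRewriter (which, like Cloudflare's, exposes lastInTextNode on text chunks); extractSitemapUrls is a hypothetical helper, not part of this commit:

// Buffer <loc> text until the node ends, since a URL may be split
// across multiple streamed chunks; then hand back the full set.
async function extractSitemapUrls(xml: string): Promise<Set<string>> {
  const urls = new Set<string>();
  let buffer = "";
  const rewritten = new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        buffer += chunk.text;
        if (chunk.lastInTextNode) {
          const url = buffer.trim();
          if (url) urls.add(url);
          buffer = "";
        }
      },
    })
    .transform(new Response(xml));
  await rewritten.text(); // drain the body so every handler has run
  return urls;
}

The committed version adds each chunk as-is, which holds up as long as the parser delivers every URL in one piece; buffering costs little and removes that assumption.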

carrefour-link-scraper/package.json

@@ -0,0 +1,17 @@
+{
+  "name": "carrefour-link-scraper",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "linkedom": "^0.16.5",
+    "p-map": "^7.0.1"
+  }
+}

package.json

@@ -4,6 +4,7 @@
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"carrefour-link-scraper",
"scraper",
"sitio",
"db-datos"

scraper/auto.ts

@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../dia-link-scraper/index.js";
 import { scrapCotoProducts } from "../coto-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
@@ -79,12 +80,7 @@ class Auto {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
     let listPath: string;
-    if (supermercado === "Carrefour") {
-      // TODO: carrefour doesn't have a scraper that saves to the DB yet
-      listPath = resolve(
-        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
-      );
-    } else {
+    {
       const t0 = performance.now();
       switch (supermercado) {
         case "Dia":
@@ -93,10 +89,14 @@ class Auto {
         case "Coto":
           await scrapCotoProducts();
           break;
+        case "Carrefour":
+          await scrapCarrefourProducts();
+          break;
       }
       this.inform(
         `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
       );
     }
+    listPath = join(ctxPath, `lista-${supermercado}.txt`);
 
     const host = Object.entries(hosts).find(
@@ -109,7 +109,7 @@ class Auto {
       .execute();
     const urls = results.map((r) => r.url);
     await writeFile(listPath, urls.join("\n") + "\n");
   }
 
   const date = new Date();
   const zstdWarcName = `${supermercado}-${format(
     date,
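The tail of this method (last hunk above) reads the freshly saved URLs back out of the productoUrls table and writes the one-URL-per-line list that the wget step consumes. Below is a minimal sketch of that step, assuming the drizzle-orm setup from the db-datos workspace; the db import path and the writeUrlList helper are hypothetical, while like, productoUrls, .execute(), and the writeFile line come from the diff:

import { like } from "drizzle-orm";
import { writeFile } from "node:fs/promises";
import { productoUrls } from "db-datos/schema.js";
import { db } from "db-datos/db.js"; // hypothetical import path

// Select every saved URL matching the supermercado's host and write
// the list file the crawler will fetch, one URL per line.
async function writeUrlList(listPath: string, host: string) {
  const results = await db
    .select({ url: productoUrls.url })
    .from(productoUrls)
    .where(like(productoUrls.url, `%${host}%`))
    .execute();
  const urls = results.map((r) => r.url);
  await writeFile(listPath, urls.join("\n") + "\n");
}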