diff --git a/bun.lockb b/bun.lockb
index 9de16ae..bd34d10 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/carrefour-link-scraper/index.ts b/carrefour-link-scraper/index.ts
new file mode 100644
index 0000000..eb9536f
--- /dev/null
+++ b/carrefour-link-scraper/index.ts
@@ -0,0 +1,44 @@
+import pMap from "p-map";
+import { saveUrls } from "db-datos/urlHelpers.js";
+
+await scrapBySitemap();
+
+export async function scrapCarrefourProducts() {
+  await scrapBySitemap();
+}
+
+async function scrapBySitemap() {
+  // from https://www.carrefour.com.ar/sitemap.xml
+  const sitemaps = [
+    "https://www.carrefour.com.ar/sitemap/product-0.xml",
+    "https://www.carrefour.com.ar/sitemap/product-1.xml",
+    "https://www.carrefour.com.ar/sitemap/product-2.xml",
+    "https://www.carrefour.com.ar/sitemap/product-3.xml",
+    "https://www.carrefour.com.ar/sitemap/product-4.xml",
+    "https://www.carrefour.com.ar/sitemap/product-5.xml",
+    "https://www.carrefour.com.ar/sitemap/product-6.xml",
+    "https://www.carrefour.com.ar/sitemap/product-7.xml",
+    "https://www.carrefour.com.ar/sitemap/product-8.xml",
+    "https://www.carrefour.com.ar/sitemap/product-9.xml",
+  ];
+
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+      let urls = new Set<string>();
+      new HTMLRewriter()
+        .on("loc", {
+          text(element) {
+            const txt = element.text.trim();
+            if (!txt) return;
+            urls.add(txt);
+          },
+        })
+        .transform(new Response(xml));
+      saveUrls(Array.from(urls));
+    },
+    { concurrency: 3 }
+  );
+}
diff --git a/carrefour-link-scraper/package.json b/carrefour-link-scraper/package.json
new file mode 100644
index 0000000..a404536
--- /dev/null
+++ b/carrefour-link-scraper/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "carrefour-link-scraper",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "linkedom": "^0.16.5",
+    "p-map": "^7.0.1"
+  }
+}
diff --git a/package.json b/package.json
index 1d88ea2..30324d4 100644
--- a/package.json
+++ b/package.json
@@ -4,6 +4,7 @@
   "workspaces": [
     "dia-link-scraper",
     "coto-link-scraper",
+    "carrefour-link-scraper",
    "scraper",
     "sitio",
     "db-datos"
diff --git a/scraper/auto.ts b/scraper/auto.ts
index b82b86a..0547fda 100644
--- a/scraper/auto.ts
+++ b/scraper/auto.ts
@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../dia-link-scraper/index.js";
 import { scrapCotoProducts } from "../coto-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
@@ -79,12 +80,7 @@ class Auto {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
 
     let listPath: string;
-    if (supermercado === "Carrefour") {
-      // TODO: Carrefour doesn't have a scraper that saves to the DB yet
-      listPath = resolve(
-        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
-      );
-    } else {
+    {
       const t0 = performance.now();
       switch (supermercado) {
         case "Dia":
@@ -93,23 +89,27 @@ class Auto {
         case "Coto":
           await scrapCotoProducts();
           break;
+        case "Carrefour":
+          await scrapCarrefourProducts();
+          break;
       }
       this.inform(
         `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
       );
-
-      listPath = join(ctxPath, `lista-${supermercado}.txt`);
-      const host = Object.entries(hosts).find(
-        ([host, supe]) => supe === supermercado
-      )![0];
-      const results = await db.query.productoUrls
-        .findMany({
-          where: like(productoUrls.url, `%${host}%`),
-        })
-        .execute();
-      const urls = results.map((r) => r.url);
-      await writeFile(listPath, urls.join("\n") + "\n");
     }
+
+    listPath = join(ctxPath, `lista-${supermercado}.txt`);
+    const host = Object.entries(hosts).find(
+      ([host, supe]) => supe === supermercado
+    )![0];
+    const results = await db.query.productoUrls
+      .findMany({
+        where: like(productoUrls.url, `%${host}%`),
+      })
+      .execute();
+    const urls = results.map((r) => r.url);
+    await writeFile(listPath, urls.join("\n") + "\n");
+
     const date = new Date();
     const zstdWarcName = `${supermercado}-${format(
       date,