diff --git a/data/Jumbo.txt b/data/Jumbo.txt new file mode 100644 index 0000000..bfbf74f --- /dev/null +++ b/data/Jumbo.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363 +size 922185 diff --git a/data/samples/Jumbo.100.txt b/data/samples/Jumbo.100.txt new file mode 100644 index 0000000..fadf390 --- /dev/null +++ b/data/samples/Jumbo.100.txt @@ -0,0 +1,100 @@ +https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p +https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p +https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p +https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p +https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p +https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p +https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p +https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p +https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p +https://www.jumbo.com.ar/pizza-muzarella-e/p +https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p +https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p +https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p +https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p +https://www.jumbo.com.ar/espumante-chandon-demi-sec/p +https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p +https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p +https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p +https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p +https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p +https://www.jumbo.com.ar/campera-mujer-puffer-larga/p +https://www.jumbo.com.ar/tabla-de-quesos/p +https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p +https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p +https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p +https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p +https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p +https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p +https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p +https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p +https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p +https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p +https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p +https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p +https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p +https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p +https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p +https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p +https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p +https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p +https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p +https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p +https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p +https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p +https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p +https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p +https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p +https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p +https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p +https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p +https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p +https://www.jumbo.com.ar/oregano-100-gr/p +https://www.jumbo.com.ar/puerro-song/p +https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p +https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p +https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p +https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p +https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p +https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p +https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p +https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p +https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p +https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p +https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p +https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p +https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p +https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p +https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p +https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p +https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p +https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p +https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p +https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p +https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p +https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p +https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p +https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p +https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p +https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p +https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p +https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p +https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p +https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p +https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p +https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p +https://www.jumbo.com.ar/sabana-king-caracol-krea/p +https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p +https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p +https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p +https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p +https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p +https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p +https://www.jumbo.com.ar/budin-limon-y-amapolas/p +https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p +https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p +https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p +https://www.jumbo.com.ar/notcreamcheese-210-gr/p +https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p +https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p +https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p diff --git a/db-datos/supermercado.ts b/db-datos/supermercado.ts index e46fe9a..1073db6 100644 --- a/db-datos/supermercado.ts +++ b/db-datos/supermercado.ts @@ -2,15 +2,18 @@ export enum Supermercado { Dia = "Dia", Carrefour = "Carrefour", Coto = "Coto", + Jumbo = "Jumbo", } export const hosts: { [host: string]: Supermercado } = { "diaonline.supermercadosdia.com.ar": Supermercado.Dia, "www.carrefour.com.ar": Supermercado.Carrefour, "www.cotodigital3.com.ar": Supermercado.Coto, + "www.jumbo.com.ar": Supermercado.Jumbo, }; export const colorBySupermercado: { [supermercado in Supermercado]: string } = { [Supermercado.Dia]: "#d52b1e", [Supermercado.Carrefour]: "#19549d", [Supermercado.Coto]: "#e20025", + [Supermercado.Jumbo]: "#2dc850", }; diff --git a/link-scrapers/jumbo.ts b/link-scrapers/jumbo.ts new file mode 100644 index 0000000..77356d5 --- /dev/null +++ b/link-scrapers/jumbo.ts @@ -0,0 +1,38 @@ +import pMap from "p-map"; +import { saveUrls } from "db-datos/urlHelpers.js"; +import { getUrlsFromSitemap } from "./common.js"; + +export async function scrapJumboProducts() { + await scrapBySitemap(); +} + +async function scrapBySitemap() { + // de https://www.jumbo.com.ar/sitemap.xml + const sitemaps = [ + "https://www.jumbo.com.ar/sitemap/product-1.xml", + "https://www.jumbo.com.ar/sitemap/product-10.xml", + "https://www.jumbo.com.ar/sitemap/product-11.xml", + "https://www.jumbo.com.ar/sitemap/product-12.xml", + "https://www.jumbo.com.ar/sitemap/product-13.xml", + "https://www.jumbo.com.ar/sitemap/product-14.xml", + "https://www.jumbo.com.ar/sitemap/product-15.xml", + "https://www.jumbo.com.ar/sitemap/product-2.xml", + "https://www.jumbo.com.ar/sitemap/product-3.xml", + "https://www.jumbo.com.ar/sitemap/product-4.xml", + "https://www.jumbo.com.ar/sitemap/product-5.xml", + "https://www.jumbo.com.ar/sitemap/product-6.xml", + "https://www.jumbo.com.ar/sitemap/product-7.xml", + "https://www.jumbo.com.ar/sitemap/product-8.xml", + "https://www.jumbo.com.ar/sitemap/product-9.xml", + ]; + + await pMap( + sitemaps, + async (sitemapUrl) => { + const res = await fetch(sitemapUrl); + const xml = await res.text(); + saveUrls(getUrlsFromSitemap(xml)); + }, + { concurrency: 3 } + ); +} diff --git a/scraper/auto.ts b/scraper/auto.ts index 21643fc..774fea4 100644 --- a/scraper/auto.ts +++ b/scraper/auto.ts @@ -11,6 +11,7 @@ import { productoUrls } from "db-datos/schema.js"; import { scrapDiaProducts } from "../link-scrapers/dia.js"; import { scrapCotoProducts } from "../link-scrapers/coto.js"; import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; +import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; const supermercados: Supermercado[] = [ Supermercado.Carrefour, @@ -59,6 +60,9 @@ class Auto { case "Carrefour": await scrapCarrefourProducts(); break; + case "Jumbo": + await scrapJumboProducts(); + break; } this.inform( `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}` diff --git a/scraper/cli.ts b/scraper/cli.ts index 0304ba8..8223973 100644 --- a/scraper/cli.ts +++ b/scraper/cli.ts @@ -1,6 +1,7 @@ import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; import { scrapCotoProducts } from "../link-scrapers/coto.js"; import { scrapDiaProducts } from "../link-scrapers/dia.js"; +import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; import { auto } from "./auto.js"; import { downloadList, getProduct } from "./scrap.js"; @@ -12,11 +13,13 @@ if (process.argv[2] === "auto") { await scrapDiaProducts(); } else if (process.argv[2] === "scrap-coto-links") { await scrapCotoProducts(); +} else if (process.argv[2] === "scrap-jumbo-links") { + await scrapJumboProducts(); } else if (process.argv[2] === "scrap-link") { const url = new URL(process.argv[3]); const res = await fetch(url); const text = await res.text(); - console.info(getProduct(url, text)); + console.info(await getProduct(url, text)); } else if (process.argv[2] === "scrap") { const urlLists = process.argv.slice(3); if (urlLists.length > 0) { diff --git a/scraper/parsers/common.ts b/scraper/parsers/common.ts index f84edff..a4a2a99 100644 --- a/scraper/parsers/common.ts +++ b/scraper/parsers/common.ts @@ -31,6 +31,7 @@ const zProductLd = z.object({ "@type": z.literal("Product"), name: z.string(), image: z.string(), + sku: z.string().optional(), offers: z.object({ offers: z.array( z.object({ diff --git a/scraper/parsers/jumbo.ts b/scraper/parsers/jumbo.ts new file mode 100644 index 0000000..b26b09f --- /dev/null +++ b/scraper/parsers/jumbo.ts @@ -0,0 +1,54 @@ +import { parseHTML } from "linkedom"; +import { type Precioish } from "../scrap.js"; +import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js"; +import { z } from "zod"; + +const zJumboSearch = z.tuple([ + z.object({ + items: z.array( + z.object({ + ean: z.string(), + }) + ), + }), +]); + +async function getEanFromSearch(sku: string) { + const url = new URL( + "https://www.jumbo.com.ar/api/catalog_system/pub/products/search" + ); + url.searchParams.set("fq", `skuId:${sku}`); + const res = await fetch(url); + const json = await res.json(); + const parsed = zJumboSearch.parse(json); + const ean = parsed[0].items[0].ean; + if (!parsed[0].items.every((x) => x.ean === ean)) { + throw new Error("Inesperado: no todos los items tienen el mismo EAN"); + } + return ean; +} + +export async function getJumboProduct( + html: string | Buffer +): Promise { + const dom = parseHTML(html); + const precioCentavos = priceFromMeta(dom); + const inStock = stockFromMeta(dom); + + const ld = getProductJsonLd(dom); + const name = ld.name; + const imageUrl = ld.image; + + const retailerSku = ld.sku; + if (!retailerSku) + throw new Error("No encontré el SKU de Jumbo para pedir el EAN"); + const ean = await getEanFromSearch(retailerSku); + + return { + name, + imageUrl, + ean, + precioCentavos, + inStock, + }; +} diff --git a/scraper/scrap.ts b/scraper/scrap.ts index f482492..6b723df 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -7,6 +7,7 @@ import { getCotoProduct } from "./parsers/coto.js"; import { join } from "path"; import { db } from "db-datos/db.js"; import pMap from "p-map"; +import { getJumboProduct } from "./parsers/jumbo.js"; const DEBUG = true; const PARSER_VERSION = 4; @@ -60,12 +61,14 @@ export async function downloadList(path: string) { return progress; } -export function getProduct(url: URL, html: string) { +export async function getProduct(url: URL, html: string): Promise { if (url.hostname === "www.carrefour.com.ar") return getCarrefourProduct(html); else if (url.hostname === "diaonline.supermercadosdia.com.ar") return getDiaProduct(html); else if (url.hostname === "www.cotodigital3.com.ar") return getCotoProduct(html); + else if (url.hostname === "www.jumbo.com.ar") + return await getJumboProduct(html); else throw new Error(`Unknown host ${url.hostname}`); } @@ -90,7 +93,7 @@ async function scrap(urlS: string): Promise { const html = await res.text(); try { - let ish = getProduct(url, html); + let ish = await getProduct(url, html); const p: Precio = { ...ish,