Compare commits

...

29 commits

Author SHA1 Message Date
387036a958 cachear home por 10 min
tarda mucho hacer la query random
2024-01-04 20:44:42 -03:00
0dd725aafd ups
cuando no hay warc_version_id, todos los NULL se groupean entre si dejando solo una entry.

esto hace que funcione bien, excepto cuando hay varios con el mismo warc_record_id, en ese caso van a aparecer como entries distintas. en la práctica creo que en prod no hay warc_version_ids duplicados.
2024-01-04 20:04:50 -03:00
899133e474 actualizar drizzle-orm scraper 2024-01-04 19:58:25 -03:00
802d2c3c4d ci: chequear types antes de pushear imagen 2024-01-04 19:56:46 -03:00
92e814b13a corregir types + ci types 2024-01-04 19:56:18 -03:00
6a29ed257d ci: chequear ts sitio 2024-01-04 19:48:26 -03:00
1ce33c250e lockfile 2024-01-04 19:46:08 -03:00
845dc2dac1 sitio: colores jumbo 2024-01-04 19:46:03 -03:00
f089ff5047 arreglar types productpreview 2024-01-04 19:45:56 -03:00
16a51e41b1 actualizar drizzle-orm 2024-01-04 19:39:21 -03:00
2f14580142 actualizar drizzle-orm sitio 2024-01-04 19:38:41 -03:00
70298a601f activar jumbo en auto 2024-01-04 19:32:16 -03:00
f154053204 lockfile 2024-01-04 19:28:03 -03:00
1ce87c4fce esperar mas 2024-01-04 19:27:19 -03:00
525510a8dd Jumbo 2024-01-04 19:25:17 -03:00
e890d5f63b arreglar busqueda para queries extrañas 2024-01-04 18:47:24 -03:00
f0798e8620 link-scrapers: reutilizar codigo sitemaps 2024-01-04 18:12:55 -03:00
fa6de68f60 scraper: reordenar codigo
- borrar código viejo
- centralizar scrapers de links
2024-01-04 18:10:02 -03:00
a322bc36fc coto: corregir chequeo instock 2024-01-04 17:49:10 -03:00
da9f2c8348 cli: poder scrappear links especificos 2024-01-04 17:48:27 -03:00
e6f084b1da coto instock 2024-01-04 17:45:46 -03:00
6256817ee1 retornar progres 2024-01-04 16:55:48 -03:00
df845acc66 esperar entre pedidos fallidos 2024-01-04 16:52:11 -03:00
3c9788647b mostrar path a html debug junto al error 2024-01-04 16:42:47 -03:00
4f5994a2e1 no volver a scrapear cosas salteadas 2024-01-04 16:42:37 -03:00
087be6714c reintentar scrap 2024-01-04 16:31:00 -03:00
7e58397c8c siempre guardar html debug 2024-01-04 15:30:25 -03:00
5312861c42 gh actions sitio 2024-01-04 15:24:56 -03:00
f80c3ad4fc github actions 2024-01-04 15:20:56 -03:00
35 changed files with 481 additions and 187 deletions

83
.github/workflows/container.yml vendored Normal file
View file

@ -0,0 +1,83 @@
# Type-checks the sitio and scraper workspaces, then builds both container
# images and pushes them to the registry configured in `env.REGISTRY`.
# Runs on every push to master.
name: check and publish container image
on:
  push:
    branches: ["master"]
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
jobs:
  # Gate: both build jobs declare `needs: check`, so no image is pushed
  # unless the `check` script passes in both workspaces.
  check:
    name: chequear typescript
    runs-on: ubuntu-latest
    steps:
      # Same checkout major version as the build jobs below.
      - uses: actions/checkout@v4
      - uses: oven-sh/setup-bun@v1
      - run: bun install
        working-directory: ./sitio
      - run: bun check
        working-directory: ./sitio
      - run: bun install
        working-directory: ./scraper
      - run: bun check
        working-directory: ./scraper
  build-and-push-scraper:
    runs-on: ubuntu-latest
    needs: check
    permissions:
      contents: read
      # Required so GITHUB_TOKEN can push to the package registry.
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          # The build context is the repository root; only the Containerfile
          # lives inside the scraper directory.
          context: .
          file: scraper/Containerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
  build-and-push-sitio:
    needs: check
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Required so GITHUB_TOKEN can push to the package registry.
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          # The build context is the repository root; only the Containerfile
          # lives inside the sitio directory.
          context: .
          file: sitio/Containerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

BIN
bun.lockb

Binary file not shown.

View file

@ -1,17 +0,0 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}

3
data/Jumbo.txt Normal file
View file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
size 922185

100
data/samples/Jumbo.100.txt Normal file
View file

@ -0,0 +1,100 @@
https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p
https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p
https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p
https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p
https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p
https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p
https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p
https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p
https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p
https://www.jumbo.com.ar/pizza-muzarella-e/p
https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p
https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p
https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p
https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p
https://www.jumbo.com.ar/espumante-chandon-demi-sec/p
https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p
https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p
https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p
https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p
https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p
https://www.jumbo.com.ar/campera-mujer-puffer-larga/p
https://www.jumbo.com.ar/tabla-de-quesos/p
https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p
https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p
https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p
https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p
https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p
https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p
https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p
https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p
https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p
https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p
https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p
https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p
https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p
https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p
https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p
https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p
https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p
https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p
https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p
https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p
https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p
https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p
https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p
https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p
https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p
https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p
https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p
https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p
https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p
https://www.jumbo.com.ar/oregano-100-gr/p
https://www.jumbo.com.ar/puerro-song/p
https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p
https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p
https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p
https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p
https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p
https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p
https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p
https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p
https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p
https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p
https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p
https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p
https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p
https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p
https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p
https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p
https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p
https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p
https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p
https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p
https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p
https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p
https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p
https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p
https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p
https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p
https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p
https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p
https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p
https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p
https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p
https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p
https://www.jumbo.com.ar/sabana-king-caracol-krea/p
https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p
https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p
https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p
https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p
https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p
https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p
https://www.jumbo.com.ar/budin-limon-y-amapolas/p
https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p
https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p
https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p
https://www.jumbo.com.ar/notcreamcheese-210-gr/p
https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p
https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p
https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p

View file

@ -11,7 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"drizzle-orm": "=0.29.1" "drizzle-orm": "^0.29.1"
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "^1.0.0", "@types/bun": "^1.0.0",

View file

@ -2,15 +2,23 @@ export enum Supermercado {
Dia = "Dia", Dia = "Dia",
Carrefour = "Carrefour", Carrefour = "Carrefour",
Coto = "Coto", Coto = "Coto",
Jumbo = "Jumbo",
} }
export const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
Supermercado.Jumbo,
];
export const hosts: { [host: string]: Supermercado } = { export const hosts: { [host: string]: Supermercado } = {
"diaonline.supermercadosdia.com.ar": Supermercado.Dia, "diaonline.supermercadosdia.com.ar": Supermercado.Dia,
"www.carrefour.com.ar": Supermercado.Carrefour, "www.carrefour.com.ar": Supermercado.Carrefour,
"www.cotodigital3.com.ar": Supermercado.Coto, "www.cotodigital3.com.ar": Supermercado.Coto,
"www.jumbo.com.ar": Supermercado.Jumbo,
}; };
export const colorBySupermercado: { [supermercado in Supermercado]: string } = { export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "#d52b1e", [Supermercado.Dia]: "#d52b1e",
[Supermercado.Carrefour]: "#19549d", [Supermercado.Carrefour]: "#19549d",
[Supermercado.Coto]: "#e20025", [Supermercado.Coto]: "#e20025",
[Supermercado.Jumbo]: "#2dc850",
}; };

View file

@ -1,17 +0,0 @@
{
"name": "dia-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.0"
}
}

View file

@ -1,6 +1,6 @@
import pMap from "p-map"; import pMap from "p-map";
import { decodeXML } from "entities";
import { saveUrls } from "db-datos/urlHelpers.js"; import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
export async function scrapCarrefourProducts() { export async function scrapCarrefourProducts() {
await scrapBySitemap(); await scrapBySitemap();
@ -26,17 +26,7 @@ async function scrapBySitemap() {
async (sitemapUrl) => { async (sitemapUrl) => {
const res = await fetch(sitemapUrl); const res = await fetch(sitemapUrl);
const xml = await res.text(); const xml = await res.text();
let urls = new Set<string>(); saveUrls(getUrlsFromSitemap(xml));
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(decodeXML(txt));
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
}, },
{ concurrency: 3 } { concurrency: 3 }
); );

14
link-scrapers/common.ts Normal file
View file

@ -0,0 +1,14 @@
import { decodeXML } from "entities";
/**
 * Collects the unique URLs found in the `<loc>` elements of a sitemap.
 *
 * @param xml - Sitemap XML document as text.
 * @returns The de-duplicated URLs in first-seen order, with XML entities
 *   decoded and surrounding whitespace removed.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // A single text node can be delivered to the handler in several chunks, so
  // accumulate them and only treat the text as a URL once the last chunk of
  // the node arrives. Otherwise a split node would be stored as broken URLs.
  let pending = "";
  // NOTE(review): like the previous version, this relies on the handlers
  // having run by the time transform() returns — confirm for the runtime in use.
  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        urls.add(decodeXML(txt));
      },
    })
    .transform(new Response(xml));
  return Array.from(urls);
}

View file

@ -1,4 +1,3 @@
import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js"; import { saveUrls } from "db-datos/urlHelpers.js";
@ -28,12 +27,13 @@ function getPage(url: string) {
return async () => { return async () => {
let html; let html;
try { try {
html = await getHtml(url); const res = await fetch(url);
html = await res.text();
} catch (error) { } catch (error) {
await getPage(url)(); await getPage(url)();
return; return;
} }
const { document } = parseHTML(html.toString("utf-8")); const { document } = parseHTML(html);
const hrefs = Array.from( const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"), document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),

View file

@ -1,8 +1,7 @@
import pMap from "p-map"; import pMap from "p-map";
import { decodeXML } from "entities";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js"; import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
const categorias = [ const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen", "https://diaonline.supermercadosdia.com.ar/almacen",
@ -82,21 +81,15 @@ async function scrapBySitemap() {
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml", "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
]; ];
await pMap(sitemaps, async (sitemapUrl) => { await pMap(
const res = await fetch(sitemapUrl); sitemaps,
const xml = await res.text(); async (sitemapUrl) => {
let urls = new Set<string>(); const res = await fetch(sitemapUrl);
new HTMLRewriter() const xml = await res.text();
.on("loc", { saveUrls(getUrlsFromSitemap(xml));
text(element) { },
const txt = element.text.trim(); { concurrency: 3 }
if (!txt) return; );
urls.add(decodeXML(txt));
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
} }
async function scrapBySite() { async function scrapBySite() {
@ -111,8 +104,9 @@ async function scrapBySite() {
await pMap( await pMap(
links, links,
async (url) => { async (url) => {
const html = await getHtml(url); const res = await fetch(url);
const { document } = parseHTML(html.toString("utf-8")); const html = await res.text();
const { document } = parseHTML(html);
const hrefs = Array.from( const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>( document.querySelectorAll<HTMLAnchorElement>(

38
link-scrapers/jumbo.ts Normal file
View file

@ -0,0 +1,38 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
/**
 * Entry point for collecting Jumbo product links.
 * Delegates to the sitemap-based strategy, the only one this module defines.
 */
export async function scrapJumboProducts() {
await scrapBySitemap();
}
/**
 * Downloads every Jumbo product sitemap and hands the URLs it lists to
 * `saveUrls`. At most three sitemaps are processed at the same time.
 */
async function scrapBySitemap() {
  // taken from https://www.jumbo.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];

  // Fetches one sitemap and stores the product URLs found in it.
  const storeSitemapUrls = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    const productUrls = getUrlsFromSitemap(body);
    saveUrls(productUrls);
  };

  await pMap(sitemaps, storeSitemapUrls, { concurrency: 3 });
}

View file

@ -1,5 +1,5 @@
{ {
"name": "coto-link-scraper", "name": "link-scrapers",
"type": "module", "type": "module",
"version": "1.0.0", "version": "1.0.0",
"description": "", "description": "",
@ -11,6 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"entities": "^4.5.0",
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"p-queue": "^8.0.1" "p-queue": "^8.0.1"
} }

View file

@ -2,9 +2,7 @@
"name": "preciazo", "name": "preciazo",
"private": true, "private": true,
"workspaces": [ "workspaces": [
"dia-link-scraper", "link-scrapers",
"coto-link-scraper",
"carrefour-link-scraper",
"scraper", "scraper",
"sitio", "sitio",
"db-datos" "db-datos"

View file

@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
## componentes (en orden de proceso) ## componentes (en orden de proceso)
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear - los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/)) (no hace falta correrlos porque ya hay listas armadas en [data/](./data/))

View file

@ -1,22 +1,17 @@
import { mkdtemp, writeFile } from "node:fs/promises"; import { mkdtemp, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os"; import { tmpdir } from "node:os";
import { join } from "node:path"; import { join } from "node:path";
import { Supermercado, hosts } from "db-datos/supermercado.js"; import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { formatDuration, intervalToDuration } from "date-fns"; import { formatDuration, intervalToDuration } from "date-fns";
import { downloadList } from "./scrap.js"; import { downloadList } from "./scrap.js";
import { db } from "db-datos/db.js"; import { db } from "db-datos/db.js";
import { like } from "drizzle-orm"; import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js"; import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js"; import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js"; import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
];
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 4 }); const scrapQueue = new PQueue({ concurrency: 4 });
@ -59,6 +54,9 @@ class Auto {
case "Carrefour": case "Carrefour":
await scrapCarrefourProducts(); await scrapCarrefourProducts();
break; break;
case "Jumbo":
await scrapJumboProducts();
break;
} }
this.inform( this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}` `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`

View file

@ -1,8 +1,9 @@
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js"; import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js"; import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { auto } from "./auto.js"; import { auto } from "./auto.js";
import { downloadList } from "./scrap.js"; import { downloadList, getProduct } from "./scrap.js";
if (process.argv[2] === "auto") { if (process.argv[2] === "auto") {
await auto(); await auto();
@ -12,6 +13,13 @@ if (process.argv[2] === "auto") {
await scrapDiaProducts(); await scrapDiaProducts();
} else if (process.argv[2] === "scrap-coto-links") { } else if (process.argv[2] === "scrap-coto-links") {
await scrapCotoProducts(); await scrapCotoProducts();
} else if (process.argv[2] === "scrap-jumbo-links") {
await scrapJumboProducts();
} else if (process.argv[2] === "scrap-link") {
const url = new URL(process.argv[3]);
const res = await fetch(url);
const text = await res.text();
console.info(await getProduct(url, text));
} else if (process.argv[2] === "scrap") { } else if (process.argv[2] === "scrap") {
const urlLists = process.argv.slice(3); const urlLists = process.argv.slice(3);
if (urlLists.length > 0) { if (urlLists.length > 0) {

View file

@ -1,13 +0,0 @@
/**
 * Downloads `url` and returns the raw bytes of the response body.
 *
 * @param url - Address to request.
 * @returns The body as a Buffer; an empty Buffer when the response has no body.
 */
export async function getHtml(url: string) {
  const res = await fetch(url);
  // `body` is null for bodyless responses (e.g. 204); iterating it would throw.
  if (res.body === null) return Buffer.alloc(0);
  return readableToBuffer(res.body);
}

/** Drains an async sequence of byte chunks into one contiguous Buffer. */
async function readableToBuffer(source: AsyncIterable<Uint8Array>) {
  // https://stackoverflow.com/a/72891118
  const buffers: Uint8Array[] = [];
  for await (const data of source) {
    buffers.push(data);
  }
  return Buffer.concat(buffers);
}

View file

@ -5,8 +5,7 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..", "check": "tsc"
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",
@ -16,8 +15,7 @@
"@aws-sdk/lib-storage": "^3.478.0", "@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6", "date-fns": "^3.0.6",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"drizzle-orm": "=0.29.1", "drizzle-orm": "^0.29.1",
"entities": "^4.5.0",
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"nanoid": "^5.0.4", "nanoid": "^5.0.4",
"p-map": "^7.0.1", "p-map": "^7.0.1",

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { Precioish } from "../scrap.js"; import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js"; import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
function parseScriptJson<T>(dom: Window, varname: string): T { function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>( const script = dom.window.document.querySelector<HTMLTemplateElement>(

View file

@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll( const scripts = dom.window.document.querySelectorAll(
'script[type="application/ld+json"]' 'script[type="application/ld+json"]'
); );
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML)); return Array.from(scripts, (script) => JSON.parse(script.innerHTML));
} }
function findJsonLd(dom: Window, type: string): object | undefined { function findJsonLd(dom: Window, type: string): object | undefined {
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type); return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
@ -31,6 +31,7 @@ const zProductLd = z.object({
"@type": z.literal("Product"), "@type": z.literal("Product"),
name: z.string(), name: z.string(),
image: z.string(), image: z.string(),
sku: z.string().optional(),
offers: z.object({ offers: z.object({
offers: z.array( offers: z.array(
z.object({ z.object({

View file

@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) {
} }
function getPriceFromText({ document }: Window) { function getPriceFromText({ document }: Window) {
const el = document.querySelector(".atg_store_newPrice"); const el = document.querySelector(".atg_store_newPrice");
if (!el?.textContent) throw new Error("no encuentro el precio"); if (!el?.textContent) return null;
const nStr = el.textContent const nStr = el.textContent
.trim() .trim()
.replace("$", "") .replace("$", "")
@ -27,12 +27,16 @@ function getPriceFromText({ document }: Window) {
.replace(",", "."); .replace(",", ".");
return parseFloat(nStr) * 100; return parseFloat(nStr) * 100;
} }
function getInStock({ document }: Window) {
return !document.querySelector(".product_not_available");
}
export function getCotoProduct(html: string | Buffer): Precioish { export function getCotoProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html); const dom = parseHTML(html);
const ean = getEanFromText(dom); const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom); const precioCentavos = getPriceFromText(dom);
const inStock = getInStock(dom);
const name = dom.document const name = dom.document
.querySelector("h1.product_page") .querySelector("h1.product_page")
@ -40,5 +44,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const imageUrl = const imageUrl =
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src; dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
return { name, imageUrl, ean, precioCentavos }; return { name, imageUrl, ean, precioCentavos, inStock };
} }

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js"; import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js"; import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish { export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html); const dom = parseHTML(html);

54
scraper/parsers/jumbo.ts Normal file
View file

@ -0,0 +1,54 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
import { z } from "zod";
// Shape expected from the search request made in getEanFromSearch: an array
// holding exactly one product, whose `items` each carry a string `ean`.
// Parsing fails if the array has zero or more than one product.
const zJumboSearch = z.tuple([
z.object({
items: z.array(
z.object({
ean: z.string(),
})
),
}),
]);
/**
 * Looks up the EAN of a product through Jumbo's catalog search endpoint,
 * filtering by SKU id.
 *
 * @param sku - SKU id to search for.
 * @returns The EAN shared by every item of the matching product.
 * @throws Error when the request fails, when the product has no items, or
 *   when its items do not all share one EAN. A response that does not match
 *   `zJumboSearch` surfaces as the error thrown by its `parse`.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);
  const res = await fetch(url);
  // Fail with the status code instead of a confusing JSON/schema error when
  // the endpoint answers with an error page.
  if (!res.ok) {
    throw new Error(
      `La búsqueda de Jumbo falló para el SKU ${sku}: status=${res.status}`
    );
  }
  const json: unknown = await res.json();
  const [{ items }] = zJumboSearch.parse(json);
  // The schema allows an empty `items` array; guard it explicitly rather than
  // crashing on `undefined.ean`.
  const [first] = items;
  if (first === undefined) {
    throw new Error(`Inesperado: el SKU ${sku} no tiene items en la búsqueda`);
  }
  const ean = first.ean;
  if (!items.every((x) => x.ean === ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return ean;
}
/**
 * Builds the product fields for a Jumbo product page.
 *
 * Price and stock are whatever `priceFromMeta` and `stockFromMeta` report for
 * the parsed page; name, image and SKU come from `getProductJsonLd`. The EAN
 * is not read from the page: it is requested from the search API by SKU.
 *
 * @param html - Product page markup.
 * @returns The fields extracted for this product.
 * @throws Error when the JSON-LD has no SKU; errors from the helpers and from
 *   the EAN lookup propagate unchanged.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);

  const { name, image: imageUrl, sku } = getProductJsonLd(dom);
  if (!sku) {
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  }

  return {
    name,
    imageUrl,
    ean: await getEanFromSearch(sku),
    precioCentavos,
    inStock,
  };
}

View file

@ -1,5 +1,6 @@
/// <reference lib="dom" />
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { writeFile } from "fs/promises"; import { writeFile, mkdir } from "fs/promises";
import { createHash } from "crypto"; import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js"; import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js"; import { getDiaProduct } from "./parsers/dia.js";
@ -7,8 +8,9 @@ import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import { db } from "db-datos/db.js"; import { db } from "db-datos/db.js";
import pMap from "p-map"; import pMap from "p-map";
import { getJumboProduct } from "./parsers/jumbo.js";
const DEBUG = false; const DEBUG = true;
const PARSER_VERSION = 4; const PARSER_VERSION = 4;
export type Precio = typeof schema.precios.$inferInsert; export type Precio = typeof schema.precios.$inferInsert;
@ -18,73 +20,108 @@ export type Precioish = Omit<
>; >;
export async function downloadList(path: string) { export async function downloadList(path: string) {
let progress: {
done: number;
skipped: number;
errors: { error: any; url: string; path: string }[];
} = { done: 0, skipped: 0, errors: [] };
let list = (await Bun.file(path).text()) let list = (await Bun.file(path).text())
.split("\n") .split("\n")
.filter((s) => s.length > 0); .filter((s) => s.length > 0);
await pMap( const results = await pMap(
list, list,
async (urlS) => { async (urlS) => {
let url; let res: ScrapResult = { type: "skipped" };
try { for (let attempts = 0; attempts < 6; attempts++) {
url = new URL(urlS); if (attempts !== 0) await wait(1500);
} catch (err) { res = await scrap(urlS);
console.error("error parseando", urlS); if (res.type === "done" || res.type === "skipped") {
return; break;
}
const res = await fetch(url);
if (!res.ok) {
console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
progress.skipped++;
return;
}
const html = await res.text();
try {
let ish: Precioish | undefined = undefined;
if (url.hostname === "www.carrefour.com.ar")
ish = getCarrefourProduct(html);
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
ish = getDiaProduct(html);
else if (url.hostname === "www.cotodigital3.com.ar")
ish = getCotoProduct(html);
else throw new Error(`Unknown host ${url.hostname}`);
const p: Precio = {
...ish,
fetchedAt: new Date(),
url: urlS,
parserVersion: PARSER_VERSION,
};
await db.insert(schema.precios).values(p);
progress.done++;
} catch (error) {
console.error({ path, urlS, error });
progress.errors.push({
path,
url: urlS,
error,
});
if (DEBUG) {
const urlHash = createHash("md5").update(urlS).digest("hex");
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);
console.error(`wrote html to ${output}`);
} }
} }
if (res.type === "error") console.error(res);
return res;
}, },
{ concurrency: 32 } { concurrency: 32 }
); );
let progress: {
done: number;
skipped: number;
errors: { error: any; url: string; debugPath: string }[];
} = { done: 0, skipped: 0, errors: [] };
for (const result of results) {
switch (result.type) {
case "done":
progress.done++;
break;
case "error":
progress.errors.push(result);
break;
case "skipped":
progress.skipped++;
break;
}
}
return progress; return progress;
} }
/**
 * Selects the site-specific parser that matches the hostname of `url` and
 * runs it over the already-downloaded page.
 *
 * @param url - Product page URL; only its `hostname` is inspected.
 * @param html - Raw HTML of the product page.
 * @returns The product data extracted by the matching parser.
 * @throws Error (as a rejected promise) when the hostname is not one of the
 *   known hosts.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  const { hostname } = url;
  switch (hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      // The Jumbo parser is awaited here, unlike the other parsers.
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${hostname}`);
  }
}
/**
 * Outcome of scraping a single URL, discriminated by the `type` field.
 */
type ScrapResult =
// The URL was not processed (`scrap` returns this for an unparsable URL or a non-ok HTTP response).
| { type: "skipped" }
// The page was parsed and its price row was inserted into the database.
| { type: "done" }
// Something threw while processing the page. `error` holds the thrown value;
// `debugPath` is a path under `debug/` derived from the MD5 of the URL, and
// the page HTML is only written there when DEBUG is enabled.
| { type: "error"; url: string; error: any; debugPath: string };
/**
 * Downloads one product page, parses it and stores the resulting price row.
 *
 * Never rejects for per-URL problems: every failure mode is folded into the
 * returned `ScrapResult` so the caller can count it and decide whether to
 * retry.
 *
 * @param urlS - Product page URL as read from the list file.
 * @returns
 *   - `{ type: "skipped" }` when `urlS` is not a valid URL or the server
 *     answered with a non-ok status;
 *   - `{ type: "done" }` once the row has been inserted;
 *   - `{ type: "error", ... }` when the request itself failed, or when
 *     parsing/inserting threw.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url: URL;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  // Stable per-URL file name for the debug dump, reported with every error.
  const urlHash = createHash("md5").update(urlS).digest("hex");
  const debugPath = join("debug", `${urlHash}.html`);

  let html: string;
  try {
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
  } catch (error) {
    // fetch() rejects on network-level failures (DNS, connection reset, ...)
    // and reading the body can fail mid-stream. Report those as a normal
    // per-URL error instead of letting the rejection escape to the caller.
    // No HTML was obtained, so nothing is written to debugPath in this case.
    return { type: "error", url: urlS, error, debugPath };
  }

  try {
    const ish = await getProduct(url, html);
    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };
    await db.insert(schema.precios).values(p);
    return { type: "done" };
  } catch (error) {
    if (DEBUG) {
      // Keep the offending HTML around so the parser failure can be reproduced.
      await mkdir("debug", { recursive: true });
      await writeFile(debugPath, html);
    }
    return { type: "error", url: urlS, error, debugPath };
  }
}
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

View file

@ -1,3 +1,4 @@
{ {
"extends": "../tsconfig.json" "extends": "../tsconfig.json",
"exclude": ["../sitio"]
} }

View file

@ -39,6 +39,6 @@
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",
"drizzle-orm": "=0.29.1" "drizzle-orm": "^0.29.1"
} }
} }

View file

@ -1,8 +1,10 @@
<script lang="ts"> <script lang="ts">
export let product: { ean: string; name: string; imageUrl: string }; export let product: { ean: string; name: string; imageUrl?: string | null };
</script> </script>
<a href={`/ean/${product.ean}`} class="flex"> <a href={`/ean/${product.ean}`} class="flex">
<img src={product.imageUrl} alt={product.name} class="max-h-48" /> {#if product.imageUrl}
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
{/if}
<p class="text-xl">{product.name}</p> <p class="text-xl">{product.name}</p>
</a> </a>

View file

@ -1,9 +1,14 @@
import type { PageServerLoad } from "./$types"; import type { PageData, PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema; const { precios } = schema;
import { sql } from "drizzle-orm"; import { sql } from "drizzle-orm";
let cache: null | { key: Date; data: PageData } = null;
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
if (cache && +new Date() < +cache.key + 1000 * 60 * 10) {
return cache.data;
}
const q = db const q = db
.select({ .select({
ean: precios.ean, ean: precios.ean,
@ -16,5 +21,7 @@ export const load: PageServerLoad = async ({ params }) => {
.orderBy(sql`random()`) .orderBy(sql`random()`)
.limit(150); .limit(150);
const res = await q; const res = await q;
return { precios: res }; const data = { precios: res };
cache = { key: new Date(), data };
return data;
}; };

View file

@ -3,6 +3,10 @@
import type { PageData } from "./$types"; import type { PageData } from "./$types";
export let data: PageData; export let data: PageData;
$: precios = data.precios.filter(
(d): d is { ean: string; name: string; imageUrl: string | null } =>
!!d.name,
);
</script> </script>
<h1 class="text-xl">WIP</h1> <h1 class="text-xl">WIP</h1>
@ -32,7 +36,7 @@
<section> <section>
<h2 class="text-lg font-bold">Random</h2> <h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3"> <ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each data.precios as product} {#each precios as product}
<li> <li>
<ProductPreview {product} /> <ProductPreview {product} />
</li> </li>

View file

@ -9,8 +9,6 @@ export const load: PageServerLoad = async ({ params }) => {
.select() .select()
.from(precios) .from(precios)
.where(eq(precios.ean, params.ean)) .where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion))
.orderBy(precios.fetchedAt); .orderBy(precios.fetchedAt);
const res = await q; const res = await q;
if (res.length === 0) return error(404, "Not Found"); if (res.length === 0) return error(404, "Not Found");

View file

@ -17,6 +17,7 @@
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]", [Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]", [Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]", [Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
}; };
</script> </script>

View file

@ -1,19 +1,17 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq, max, sql } from "drizzle-orm"; import { sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db } from "$lib/server/db";
const { precios } = schema;
export const load: PageServerLoad = async ({ url }) => { export const load: PageServerLoad = async ({ url }) => {
const query = url.searchParams.get("q"); const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; imageUrl: string }[] = null; let results: null | { ean: string; name: string; imageUrl: string }[] = null;
if (query) { if (query) {
results = db.all( const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean join precios p on p.ean = f.ean
where f.name match ${query} where f.name match ${`"${query}"`}
group by p.ean;`, group by p.ean;`;
); results = db.all(sqlQuery);
} }
return { query, results }; return { query, results };

View file

@ -15,5 +15,6 @@
"noEmit": true, "noEmit": true,
"forceConsistentCasingInFileNames": true "forceConsistentCasingInFileNames": true
}, },
"include": ["**/*.ts", "**/*.js"] "include": ["**/*.ts", "**/*.js"],
"exclude": ["sitio/build"]
} }