mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
29 commits
71e66cf437
...
387036a958
Author | SHA1 | Date | |
---|---|---|---|
387036a958 | |||
0dd725aafd | |||
899133e474 | |||
802d2c3c4d | |||
92e814b13a | |||
6a29ed257d | |||
1ce33c250e | |||
845dc2dac1 | |||
f089ff5047 | |||
16a51e41b1 | |||
2f14580142 | |||
70298a601f | |||
f154053204 | |||
1ce87c4fce | |||
525510a8dd | |||
e890d5f63b | |||
f0798e8620 | |||
fa6de68f60 | |||
a322bc36fc | |||
da9f2c8348 | |||
e6f084b1da | |||
6256817ee1 | |||
df845acc66 | |||
3c9788647b | |||
4f5994a2e1 | |||
087be6714c | |||
7e58397c8c | |||
5312861c42 | |||
f80c3ad4fc |
35 changed files with 481 additions and 187 deletions
83
.github/workflows/container.yml
vendored
Normal file
83
.github/workflows/container.yml
vendored
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
name: check and publish container image
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: ["master"]
|
||||||
|
|
||||||
|
env:
|
||||||
|
REGISTRY: ghcr.io
|
||||||
|
IMAGE_NAME: ${{ github.repository }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
check:
|
||||||
|
name: chequear typescript
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- uses: oven-sh/setup-bun@v1
|
||||||
|
|
||||||
|
- run: bun install
|
||||||
|
working-directory: ./sitio
|
||||||
|
- run: bun check
|
||||||
|
working-directory: ./sitio
|
||||||
|
- run: bun install
|
||||||
|
working-directory: ./scraper
|
||||||
|
- run: bun check
|
||||||
|
working-directory: ./scraper
|
||||||
|
|
||||||
|
build-and-push-scraper:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: check
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Log in to the Container registry
|
||||||
|
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||||
|
with:
|
||||||
|
registry: ${{ env.REGISTRY }}
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||||
|
with:
|
||||||
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
||||||
|
- name: Build and push Docker image
|
||||||
|
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: scraper/Containerfile
|
||||||
|
push: true
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
build-and-push-sitio:
|
||||||
|
needs: check
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Log in to the Container registry
|
||||||
|
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||||
|
with:
|
||||||
|
registry: ${{ env.REGISTRY }}
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||||
|
with:
|
||||||
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
|
||||||
|
- name: Build and push Docker image
|
||||||
|
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: sitio/Containerfile
|
||||||
|
push: true
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,17 +0,0 @@
|
||||||
{
|
|
||||||
"name": "carrefour-link-scraper",
|
|
||||||
"type": "module",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "",
|
|
||||||
"main": "index.js",
|
|
||||||
"scripts": {
|
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
|
||||||
},
|
|
||||||
"keywords": [],
|
|
||||||
"author": "",
|
|
||||||
"license": "ISC",
|
|
||||||
"dependencies": {
|
|
||||||
"linkedom": "^0.16.5",
|
|
||||||
"p-map": "^7.0.1"
|
|
||||||
}
|
|
||||||
}
|
|
3
data/Jumbo.txt
Normal file
3
data/Jumbo.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
|
||||||
|
size 922185
|
100
data/samples/Jumbo.100.txt
Normal file
100
data/samples/Jumbo.100.txt
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p
|
||||||
|
https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p
|
||||||
|
https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p
|
||||||
|
https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p
|
||||||
|
https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p
|
||||||
|
https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p
|
||||||
|
https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p
|
||||||
|
https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p
|
||||||
|
https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p
|
||||||
|
https://www.jumbo.com.ar/pizza-muzarella-e/p
|
||||||
|
https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p
|
||||||
|
https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p
|
||||||
|
https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p
|
||||||
|
https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p
|
||||||
|
https://www.jumbo.com.ar/espumante-chandon-demi-sec/p
|
||||||
|
https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p
|
||||||
|
https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p
|
||||||
|
https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p
|
||||||
|
https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p
|
||||||
|
https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p
|
||||||
|
https://www.jumbo.com.ar/campera-mujer-puffer-larga/p
|
||||||
|
https://www.jumbo.com.ar/tabla-de-quesos/p
|
||||||
|
https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p
|
||||||
|
https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p
|
||||||
|
https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p
|
||||||
|
https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p
|
||||||
|
https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p
|
||||||
|
https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p
|
||||||
|
https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p
|
||||||
|
https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p
|
||||||
|
https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p
|
||||||
|
https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p
|
||||||
|
https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p
|
||||||
|
https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p
|
||||||
|
https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p
|
||||||
|
https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p
|
||||||
|
https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p
|
||||||
|
https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p
|
||||||
|
https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p
|
||||||
|
https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p
|
||||||
|
https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p
|
||||||
|
https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p
|
||||||
|
https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p
|
||||||
|
https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p
|
||||||
|
https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p
|
||||||
|
https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p
|
||||||
|
https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p
|
||||||
|
https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p
|
||||||
|
https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p
|
||||||
|
https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p
|
||||||
|
https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p
|
||||||
|
https://www.jumbo.com.ar/oregano-100-gr/p
|
||||||
|
https://www.jumbo.com.ar/puerro-song/p
|
||||||
|
https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p
|
||||||
|
https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p
|
||||||
|
https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p
|
||||||
|
https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p
|
||||||
|
https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p
|
||||||
|
https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p
|
||||||
|
https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p
|
||||||
|
https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p
|
||||||
|
https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p
|
||||||
|
https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p
|
||||||
|
https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p
|
||||||
|
https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p
|
||||||
|
https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p
|
||||||
|
https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p
|
||||||
|
https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p
|
||||||
|
https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p
|
||||||
|
https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p
|
||||||
|
https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p
|
||||||
|
https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p
|
||||||
|
https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p
|
||||||
|
https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p
|
||||||
|
https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p
|
||||||
|
https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p
|
||||||
|
https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p
|
||||||
|
https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p
|
||||||
|
https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p
|
||||||
|
https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p
|
||||||
|
https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p
|
||||||
|
https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p
|
||||||
|
https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p
|
||||||
|
https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p
|
||||||
|
https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p
|
||||||
|
https://www.jumbo.com.ar/sabana-king-caracol-krea/p
|
||||||
|
https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p
|
||||||
|
https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p
|
||||||
|
https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p
|
||||||
|
https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p
|
||||||
|
https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p
|
||||||
|
https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p
|
||||||
|
https://www.jumbo.com.ar/budin-limon-y-amapolas/p
|
||||||
|
https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p
|
||||||
|
https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p
|
||||||
|
https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p
|
||||||
|
https://www.jumbo.com.ar/notcreamcheese-210-gr/p
|
||||||
|
https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p
|
||||||
|
https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p
|
||||||
|
https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p
|
|
@ -11,7 +11,7 @@
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"drizzle-orm": "=0.29.1"
|
"drizzle-orm": "^0.29.1"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/bun": "^1.0.0",
|
"@types/bun": "^1.0.0",
|
||||||
|
|
|
@ -2,15 +2,23 @@ export enum Supermercado {
|
||||||
Dia = "Dia",
|
Dia = "Dia",
|
||||||
Carrefour = "Carrefour",
|
Carrefour = "Carrefour",
|
||||||
Coto = "Coto",
|
Coto = "Coto",
|
||||||
|
Jumbo = "Jumbo",
|
||||||
}
|
}
|
||||||
|
export const supermercados: Supermercado[] = [
|
||||||
|
Supermercado.Carrefour,
|
||||||
|
Supermercado.Coto,
|
||||||
|
Supermercado.Dia,
|
||||||
|
Supermercado.Jumbo,
|
||||||
|
];
|
||||||
export const hosts: { [host: string]: Supermercado } = {
|
export const hosts: { [host: string]: Supermercado } = {
|
||||||
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
||||||
"www.carrefour.com.ar": Supermercado.Carrefour,
|
"www.carrefour.com.ar": Supermercado.Carrefour,
|
||||||
"www.cotodigital3.com.ar": Supermercado.Coto,
|
"www.cotodigital3.com.ar": Supermercado.Coto,
|
||||||
|
"www.jumbo.com.ar": Supermercado.Jumbo,
|
||||||
};
|
};
|
||||||
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||||
[Supermercado.Dia]: "#d52b1e",
|
[Supermercado.Dia]: "#d52b1e",
|
||||||
[Supermercado.Carrefour]: "#19549d",
|
[Supermercado.Carrefour]: "#19549d",
|
||||||
[Supermercado.Coto]: "#e20025",
|
[Supermercado.Coto]: "#e20025",
|
||||||
|
[Supermercado.Jumbo]: "#2dc850",
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
{
|
|
||||||
"name": "dia-link-scraper",
|
|
||||||
"type": "module",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "",
|
|
||||||
"main": "index.js",
|
|
||||||
"scripts": {
|
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
|
||||||
},
|
|
||||||
"keywords": [],
|
|
||||||
"author": "",
|
|
||||||
"license": "ISC",
|
|
||||||
"dependencies": {
|
|
||||||
"linkedom": "^0.16.5",
|
|
||||||
"p-map": "^7.0.0"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { decodeXML } from "entities";
|
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
export async function scrapCarrefourProducts() {
|
export async function scrapCarrefourProducts() {
|
||||||
await scrapBySitemap();
|
await scrapBySitemap();
|
||||||
|
@ -26,17 +26,7 @@ async function scrapBySitemap() {
|
||||||
async (sitemapUrl) => {
|
async (sitemapUrl) => {
|
||||||
const res = await fetch(sitemapUrl);
|
const res = await fetch(sitemapUrl);
|
||||||
const xml = await res.text();
|
const xml = await res.text();
|
||||||
let urls = new Set<string>();
|
saveUrls(getUrlsFromSitemap(xml));
|
||||||
new HTMLRewriter()
|
|
||||||
.on("loc", {
|
|
||||||
text(element) {
|
|
||||||
const txt = element.text.trim();
|
|
||||||
if (!txt) return;
|
|
||||||
urls.add(decodeXML(txt));
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.transform(new Response(xml));
|
|
||||||
saveUrls(Array.from(urls));
|
|
||||||
},
|
},
|
||||||
{ concurrency: 3 }
|
{ concurrency: 3 }
|
||||||
);
|
);
|
14
link-scrapers/common.ts
Normal file
14
link-scrapers/common.ts
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import { decodeXML } from "entities";
|
||||||
|
export function getUrlsFromSitemap(xml: string) {
|
||||||
|
let urls = new Set<string>();
|
||||||
|
new HTMLRewriter()
|
||||||
|
.on("loc", {
|
||||||
|
text(element) {
|
||||||
|
const txt = element.text.trim();
|
||||||
|
if (!txt) return;
|
||||||
|
urls.add(decodeXML(txt));
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.transform(new Response(xml));
|
||||||
|
return Array.from(urls);
|
||||||
|
}
|
|
@ -1,4 +1,3 @@
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
@ -28,12 +27,13 @@ function getPage(url: string) {
|
||||||
return async () => {
|
return async () => {
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
html = await getHtml(url);
|
const res = await fetch(url);
|
||||||
|
html = await res.text();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
await getPage(url)();
|
await getPage(url)();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { document } = parseHTML(html.toString("utf-8"));
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
|
@ -1,8 +1,7 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { decodeXML } from "entities";
|
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
const categorias = [
|
const categorias = [
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||||
|
@ -82,21 +81,15 @@ async function scrapBySitemap() {
|
||||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||||
];
|
];
|
||||||
|
|
||||||
await pMap(sitemaps, async (sitemapUrl) => {
|
await pMap(
|
||||||
const res = await fetch(sitemapUrl);
|
sitemaps,
|
||||||
const xml = await res.text();
|
async (sitemapUrl) => {
|
||||||
let urls = new Set<string>();
|
const res = await fetch(sitemapUrl);
|
||||||
new HTMLRewriter()
|
const xml = await res.text();
|
||||||
.on("loc", {
|
saveUrls(getUrlsFromSitemap(xml));
|
||||||
text(element) {
|
},
|
||||||
const txt = element.text.trim();
|
{ concurrency: 3 }
|
||||||
if (!txt) return;
|
);
|
||||||
urls.add(decodeXML(txt));
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.transform(new Response(xml));
|
|
||||||
saveUrls(Array.from(urls));
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapBySite() {
|
async function scrapBySite() {
|
||||||
|
@ -111,8 +104,9 @@ async function scrapBySite() {
|
||||||
await pMap(
|
await pMap(
|
||||||
links,
|
links,
|
||||||
async (url) => {
|
async (url) => {
|
||||||
const html = await getHtml(url);
|
const res = await fetch(url);
|
||||||
const { document } = parseHTML(html.toString("utf-8"));
|
const html = await res.text();
|
||||||
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(
|
document.querySelectorAll<HTMLAnchorElement>(
|
38
link-scrapers/jumbo.ts
Normal file
38
link-scrapers/jumbo.ts
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
import pMap from "p-map";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
|
export async function scrapJumboProducts() {
|
||||||
|
await scrapBySitemap();
|
||||||
|
}
|
||||||
|
|
||||||
|
async function scrapBySitemap() {
|
||||||
|
// de https://www.jumbo.com.ar/sitemap.xml
|
||||||
|
const sitemaps = [
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-1.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-10.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-11.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-12.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-13.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-14.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-15.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-2.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-3.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-4.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-5.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-6.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-7.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-8.xml",
|
||||||
|
"https://www.jumbo.com.ar/sitemap/product-9.xml",
|
||||||
|
];
|
||||||
|
|
||||||
|
await pMap(
|
||||||
|
sitemaps,
|
||||||
|
async (sitemapUrl) => {
|
||||||
|
const res = await fetch(sitemapUrl);
|
||||||
|
const xml = await res.text();
|
||||||
|
saveUrls(getUrlsFromSitemap(xml));
|
||||||
|
},
|
||||||
|
{ concurrency: 3 }
|
||||||
|
);
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"name": "coto-link-scraper",
|
"name": "link-scrapers",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "",
|
"description": "",
|
||||||
|
@ -11,6 +11,7 @@
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"entities": "^4.5.0",
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"p-queue": "^8.0.1"
|
"p-queue": "^8.0.1"
|
||||||
}
|
}
|
|
@ -2,9 +2,7 @@
|
||||||
"name": "preciazo",
|
"name": "preciazo",
|
||||||
"private": true,
|
"private": true,
|
||||||
"workspaces": [
|
"workspaces": [
|
||||||
"dia-link-scraper",
|
"link-scrapers",
|
||||||
"coto-link-scraper",
|
|
||||||
"carrefour-link-scraper",
|
|
||||||
"scraper",
|
"scraper",
|
||||||
"sitio",
|
"sitio",
|
||||||
"db-datos"
|
"db-datos"
|
||||||
|
|
|
@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
|
||||||
|
|
||||||
## componentes (en orden de proceso)
|
## componentes (en orden de proceso)
|
||||||
|
|
||||||
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
|
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
|
||||||
|
|
||||||
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
||||||
|
|
||||||
|
|
|
@ -1,22 +1,17 @@
|
||||||
import { mkdtemp, writeFile } from "node:fs/promises";
|
import { mkdtemp, writeFile } from "node:fs/promises";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { join } from "node:path";
|
import { join } from "node:path";
|
||||||
import { Supermercado, hosts } from "db-datos/supermercado.js";
|
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { formatDuration, intervalToDuration } from "date-fns";
|
import { formatDuration, intervalToDuration } from "date-fns";
|
||||||
import { downloadList } from "./scrap.js";
|
import { downloadList } from "./scrap.js";
|
||||||
import { db } from "db-datos/db.js";
|
import { db } from "db-datos/db.js";
|
||||||
import { like } from "drizzle-orm";
|
import { like } from "drizzle-orm";
|
||||||
import { productoUrls } from "db-datos/schema.js";
|
import { productoUrls } from "db-datos/schema.js";
|
||||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
|
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||||
const supermercados: Supermercado[] = [
|
|
||||||
Supermercado.Carrefour,
|
|
||||||
Supermercado.Coto,
|
|
||||||
Supermercado.Dia,
|
|
||||||
];
|
|
||||||
|
|
||||||
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
|
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
|
||||||
const scrapQueue = new PQueue({ concurrency: 4 });
|
const scrapQueue = new PQueue({ concurrency: 4 });
|
||||||
|
@ -59,6 +54,9 @@ class Auto {
|
||||||
case "Carrefour":
|
case "Carrefour":
|
||||||
await scrapCarrefourProducts();
|
await scrapCarrefourProducts();
|
||||||
break;
|
break;
|
||||||
|
case "Jumbo":
|
||||||
|
await scrapJumboProducts();
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
this.inform(
|
this.inform(
|
||||||
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
|
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||||
import { auto } from "./auto.js";
|
import { auto } from "./auto.js";
|
||||||
import { downloadList } from "./scrap.js";
|
import { downloadList, getProduct } from "./scrap.js";
|
||||||
|
|
||||||
if (process.argv[2] === "auto") {
|
if (process.argv[2] === "auto") {
|
||||||
await auto();
|
await auto();
|
||||||
|
@ -12,6 +13,13 @@ if (process.argv[2] === "auto") {
|
||||||
await scrapDiaProducts();
|
await scrapDiaProducts();
|
||||||
} else if (process.argv[2] === "scrap-coto-links") {
|
} else if (process.argv[2] === "scrap-coto-links") {
|
||||||
await scrapCotoProducts();
|
await scrapCotoProducts();
|
||||||
|
} else if (process.argv[2] === "scrap-jumbo-links") {
|
||||||
|
await scrapJumboProducts();
|
||||||
|
} else if (process.argv[2] === "scrap-link") {
|
||||||
|
const url = new URL(process.argv[3]);
|
||||||
|
const res = await fetch(url);
|
||||||
|
const text = await res.text();
|
||||||
|
console.info(await getProduct(url, text));
|
||||||
} else if (process.argv[2] === "scrap") {
|
} else if (process.argv[2] === "scrap") {
|
||||||
const urlLists = process.argv.slice(3);
|
const urlLists = process.argv.slice(3);
|
||||||
if (urlLists.length > 0) {
|
if (urlLists.length > 0) {
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
export async function getHtml(url: string) {
|
|
||||||
const res = await fetch(url);
|
|
||||||
return readableToBuffer(res.body!);
|
|
||||||
}
|
|
||||||
|
|
||||||
async function readableToBuffer(source: AsyncIterable<any>) {
|
|
||||||
// https://stackoverflow.com/a/72891118
|
|
||||||
const buffers = [];
|
|
||||||
for await (const data of source) {
|
|
||||||
buffers.push(data);
|
|
||||||
}
|
|
||||||
return Buffer.concat(buffers);
|
|
||||||
}
|
|
|
@ -5,8 +5,7 @@
|
||||||
"description": "",
|
"description": "",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
|
"check": "tsc"
|
||||||
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
|
|
||||||
},
|
},
|
||||||
"keywords": [],
|
"keywords": [],
|
||||||
"author": "",
|
"author": "",
|
||||||
|
@ -16,8 +15,7 @@
|
||||||
"@aws-sdk/lib-storage": "^3.478.0",
|
"@aws-sdk/lib-storage": "^3.478.0",
|
||||||
"date-fns": "^3.0.6",
|
"date-fns": "^3.0.6",
|
||||||
"db-datos": "workspace:^",
|
"db-datos": "workspace:^",
|
||||||
"drizzle-orm": "=0.29.1",
|
"drizzle-orm": "^0.29.1",
|
||||||
"entities": "^4.5.0",
|
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
"p-map": "^7.0.1",
|
"p-map": "^7.0.1",
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish } from "../scrap.js";
|
import { Precioish } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||||
|
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||||
|
|
|
@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] {
|
||||||
const scripts = dom.window.document.querySelectorAll(
|
const scripts = dom.window.document.querySelectorAll(
|
||||||
'script[type="application/ld+json"]'
|
'script[type="application/ld+json"]'
|
||||||
);
|
);
|
||||||
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
|
return Array.from(scripts, (script) => JSON.parse(script.innerHTML));
|
||||||
}
|
}
|
||||||
function findJsonLd(dom: Window, type: string): object | undefined {
|
function findJsonLd(dom: Window, type: string): object | undefined {
|
||||||
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
||||||
|
@ -31,6 +31,7 @@ const zProductLd = z.object({
|
||||||
"@type": z.literal("Product"),
|
"@type": z.literal("Product"),
|
||||||
name: z.string(),
|
name: z.string(),
|
||||||
image: z.string(),
|
image: z.string(),
|
||||||
|
sku: z.string().optional(),
|
||||||
offers: z.object({
|
offers: z.object({
|
||||||
offers: z.array(
|
offers: z.array(
|
||||||
z.object({
|
z.object({
|
|
@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) {
|
||||||
}
|
}
|
||||||
function getPriceFromText({ document }: Window) {
|
function getPriceFromText({ document }: Window) {
|
||||||
const el = document.querySelector(".atg_store_newPrice");
|
const el = document.querySelector(".atg_store_newPrice");
|
||||||
if (!el?.textContent) throw new Error("no encuentro el precio");
|
if (!el?.textContent) return null;
|
||||||
const nStr = el.textContent
|
const nStr = el.textContent
|
||||||
.trim()
|
.trim()
|
||||||
.replace("$", "")
|
.replace("$", "")
|
||||||
|
@ -27,12 +27,16 @@ function getPriceFromText({ document }: Window) {
|
||||||
.replace(",", ".");
|
.replace(",", ".");
|
||||||
return parseFloat(nStr) * 100;
|
return parseFloat(nStr) * 100;
|
||||||
}
|
}
|
||||||
|
function getInStock({ document }: Window) {
|
||||||
|
return !document.querySelector(".product_not_available");
|
||||||
|
}
|
||||||
|
|
||||||
export function getCotoProduct(html: string | Buffer): Precioish {
|
export function getCotoProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
||||||
const ean = getEanFromText(dom);
|
const ean = getEanFromText(dom);
|
||||||
const precioCentavos = getPriceFromText(dom);
|
const precioCentavos = getPriceFromText(dom);
|
||||||
|
const inStock = getInStock(dom);
|
||||||
|
|
||||||
const name = dom.document
|
const name = dom.document
|
||||||
.querySelector("h1.product_page")
|
.querySelector("h1.product_page")
|
||||||
|
@ -40,5 +44,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
|
||||||
const imageUrl =
|
const imageUrl =
|
||||||
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
|
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
|
||||||
|
|
||||||
return { name, imageUrl, ean, precioCentavos };
|
return { name, imageUrl, ean, precioCentavos, inStock };
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { type Precioish } from "../scrap.js";
|
import { type Precioish } from "../scrap.js";
|
||||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||||
|
|
||||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
54
scraper/parsers/jumbo.ts
Normal file
54
scraper/parsers/jumbo.ts
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { type Precioish } from "../scrap.js";
|
||||||
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
const zJumboSearch = z.tuple([
|
||||||
|
z.object({
|
||||||
|
items: z.array(
|
||||||
|
z.object({
|
||||||
|
ean: z.string(),
|
||||||
|
})
|
||||||
|
),
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
|
||||||
|
async function getEanFromSearch(sku: string) {
|
||||||
|
const url = new URL(
|
||||||
|
"https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
|
||||||
|
);
|
||||||
|
url.searchParams.set("fq", `skuId:${sku}`);
|
||||||
|
const res = await fetch(url);
|
||||||
|
const json = await res.json();
|
||||||
|
const parsed = zJumboSearch.parse(json);
|
||||||
|
const ean = parsed[0].items[0].ean;
|
||||||
|
if (!parsed[0].items.every((x) => x.ean === ean)) {
|
||||||
|
throw new Error("Inesperado: no todos los items tienen el mismo EAN");
|
||||||
|
}
|
||||||
|
return ean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getJumboProduct(
|
||||||
|
html: string | Buffer
|
||||||
|
): Promise<Precioish> {
|
||||||
|
const dom = parseHTML(html);
|
||||||
|
const precioCentavos = priceFromMeta(dom);
|
||||||
|
const inStock = stockFromMeta(dom);
|
||||||
|
|
||||||
|
const ld = getProductJsonLd(dom);
|
||||||
|
const name = ld.name;
|
||||||
|
const imageUrl = ld.image;
|
||||||
|
|
||||||
|
const retailerSku = ld.sku;
|
||||||
|
if (!retailerSku)
|
||||||
|
throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
|
||||||
|
const ean = await getEanFromSearch(retailerSku);
|
||||||
|
|
||||||
|
return {
|
||||||
|
name,
|
||||||
|
imageUrl,
|
||||||
|
ean,
|
||||||
|
precioCentavos,
|
||||||
|
inStock,
|
||||||
|
};
|
||||||
|
}
|
153
scraper/scrap.ts
153
scraper/scrap.ts
|
@ -1,5 +1,6 @@
|
||||||
|
/// <reference lib="dom" />
|
||||||
import * as schema from "db-datos/schema.js";
|
import * as schema from "db-datos/schema.js";
|
||||||
import { writeFile } from "fs/promises";
|
import { writeFile, mkdir } from "fs/promises";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||||
import { getDiaProduct } from "./parsers/dia.js";
|
import { getDiaProduct } from "./parsers/dia.js";
|
||||||
|
@ -7,8 +8,9 @@ import { getCotoProduct } from "./parsers/coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { db } from "db-datos/db.js";
|
import { db } from "db-datos/db.js";
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
|
import { getJumboProduct } from "./parsers/jumbo.js";
|
||||||
|
|
||||||
const DEBUG = false;
|
const DEBUG = true;
|
||||||
const PARSER_VERSION = 4;
|
const PARSER_VERSION = 4;
|
||||||
|
|
||||||
export type Precio = typeof schema.precios.$inferInsert;
|
export type Precio = typeof schema.precios.$inferInsert;
|
||||||
|
@ -18,73 +20,108 @@ export type Precioish = Omit<
|
||||||
>;
|
>;
|
||||||
|
|
||||||
export async function downloadList(path: string) {
|
export async function downloadList(path: string) {
|
||||||
let progress: {
|
|
||||||
done: number;
|
|
||||||
skipped: number;
|
|
||||||
errors: { error: any; url: string; path: string }[];
|
|
||||||
} = { done: 0, skipped: 0, errors: [] };
|
|
||||||
|
|
||||||
let list = (await Bun.file(path).text())
|
let list = (await Bun.file(path).text())
|
||||||
.split("\n")
|
.split("\n")
|
||||||
.filter((s) => s.length > 0);
|
.filter((s) => s.length > 0);
|
||||||
|
|
||||||
await pMap(
|
const results = await pMap(
|
||||||
list,
|
list,
|
||||||
async (urlS) => {
|
async (urlS) => {
|
||||||
let url;
|
let res: ScrapResult = { type: "skipped" };
|
||||||
try {
|
for (let attempts = 0; attempts < 6; attempts++) {
|
||||||
url = new URL(urlS);
|
if (attempts !== 0) await wait(1500);
|
||||||
} catch (err) {
|
res = await scrap(urlS);
|
||||||
console.error("error parseando", urlS);
|
if (res.type === "done" || res.type === "skipped") {
|
||||||
return;
|
break;
|
||||||
}
|
|
||||||
const res = await fetch(url);
|
|
||||||
if (!res.ok) {
|
|
||||||
console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
|
|
||||||
progress.skipped++;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const html = await res.text();
|
|
||||||
|
|
||||||
try {
|
|
||||||
let ish: Precioish | undefined = undefined;
|
|
||||||
if (url.hostname === "www.carrefour.com.ar")
|
|
||||||
ish = getCarrefourProduct(html);
|
|
||||||
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
|
||||||
ish = getDiaProduct(html);
|
|
||||||
else if (url.hostname === "www.cotodigital3.com.ar")
|
|
||||||
ish = getCotoProduct(html);
|
|
||||||
else throw new Error(`Unknown host ${url.hostname}`);
|
|
||||||
|
|
||||||
const p: Precio = {
|
|
||||||
...ish,
|
|
||||||
fetchedAt: new Date(),
|
|
||||||
url: urlS,
|
|
||||||
parserVersion: PARSER_VERSION,
|
|
||||||
};
|
|
||||||
|
|
||||||
await db.insert(schema.precios).values(p);
|
|
||||||
|
|
||||||
progress.done++;
|
|
||||||
} catch (error) {
|
|
||||||
console.error({ path, urlS, error });
|
|
||||||
progress.errors.push({
|
|
||||||
path,
|
|
||||||
url: urlS,
|
|
||||||
error,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (DEBUG) {
|
|
||||||
const urlHash = createHash("md5").update(urlS).digest("hex");
|
|
||||||
const output = join("debug", `${urlHash}.html`);
|
|
||||||
await writeFile(output, html);
|
|
||||||
console.error(`wrote html to ${output}`);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (res.type === "error") console.error(res);
|
||||||
|
return res;
|
||||||
},
|
},
|
||||||
{ concurrency: 32 }
|
{ concurrency: 32 }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let progress: {
|
||||||
|
done: number;
|
||||||
|
skipped: number;
|
||||||
|
errors: { error: any; url: string; debugPath: string }[];
|
||||||
|
} = { done: 0, skipped: 0, errors: [] };
|
||||||
|
for (const result of results) {
|
||||||
|
switch (result.type) {
|
||||||
|
case "done":
|
||||||
|
progress.done++;
|
||||||
|
break;
|
||||||
|
case "error":
|
||||||
|
progress.errors.push(result);
|
||||||
|
break;
|
||||||
|
case "skipped":
|
||||||
|
progress.skipped++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
return progress;
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function getProduct(url: URL, html: string): Promise<Precioish> {
|
||||||
|
if (url.hostname === "www.carrefour.com.ar") return getCarrefourProduct(html);
|
||||||
|
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
||||||
|
return getDiaProduct(html);
|
||||||
|
else if (url.hostname === "www.cotodigital3.com.ar")
|
||||||
|
return getCotoProduct(html);
|
||||||
|
else if (url.hostname === "www.jumbo.com.ar")
|
||||||
|
return await getJumboProduct(html);
|
||||||
|
else throw new Error(`Unknown host ${url.hostname}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
type ScrapResult =
|
||||||
|
| { type: "skipped" }
|
||||||
|
| { type: "done" }
|
||||||
|
| { type: "error"; url: string; error: any; debugPath: string };
|
||||||
|
async function scrap(urlS: string): Promise<ScrapResult> {
|
||||||
|
let url;
|
||||||
|
try {
|
||||||
|
url = new URL(urlS);
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`skipped ${urlS} because ${err}`);
|
||||||
|
return { type: "skipped" };
|
||||||
|
}
|
||||||
|
const res = await fetch(url);
|
||||||
|
if (!res.ok) {
|
||||||
|
console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
|
||||||
|
return { type: "skipped" };
|
||||||
|
}
|
||||||
|
|
||||||
|
const html = await res.text();
|
||||||
|
|
||||||
|
try {
|
||||||
|
let ish = await getProduct(url, html);
|
||||||
|
|
||||||
|
const p: Precio = {
|
||||||
|
...ish,
|
||||||
|
fetchedAt: new Date(),
|
||||||
|
url: urlS,
|
||||||
|
parserVersion: PARSER_VERSION,
|
||||||
|
};
|
||||||
|
|
||||||
|
await db.insert(schema.precios).values(p);
|
||||||
|
|
||||||
|
return { type: "done" };
|
||||||
|
} catch (error) {
|
||||||
|
const urlHash = createHash("md5").update(urlS).digest("hex");
|
||||||
|
const output = join("debug", `${urlHash}.html`);
|
||||||
|
if (DEBUG) {
|
||||||
|
await mkdir("debug", { recursive: true });
|
||||||
|
await writeFile(output, html);
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
type: "error",
|
||||||
|
url: urlS,
|
||||||
|
error,
|
||||||
|
debugPath: output,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function wait(ms: number) {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
{
|
{
|
||||||
"extends": "../tsconfig.json"
|
"extends": "../tsconfig.json",
|
||||||
|
"exclude": ["../sitio"]
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,6 @@
|
||||||
"chart.js": "^4.4.1",
|
"chart.js": "^4.4.1",
|
||||||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||||
"dayjs": "^1.11.10",
|
"dayjs": "^1.11.10",
|
||||||
"drizzle-orm": "=0.29.1"
|
"drizzle-orm": "^0.29.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
export let product: { ean: string; name: string; imageUrl: string };
|
export let product: { ean: string; name: string; imageUrl?: string | null };
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<a href={`/ean/${product.ean}`} class="flex">
|
<a href={`/ean/${product.ean}`} class="flex">
|
||||||
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
|
{#if product.imageUrl}
|
||||||
|
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
|
||||||
|
{/if}
|
||||||
<p class="text-xl">{product.name}</p>
|
<p class="text-xl">{product.name}</p>
|
||||||
</a>
|
</a>
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageData, PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { db, schema } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
const { precios } = schema;
|
||||||
import { sql } from "drizzle-orm";
|
import { sql } from "drizzle-orm";
|
||||||
|
|
||||||
|
let cache: null | { key: Date; data: PageData } = null;
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({ params }) => {
|
export const load: PageServerLoad = async ({ params }) => {
|
||||||
|
if (cache && +new Date() < +cache.key + 1000 * 60 * 10) {
|
||||||
|
return cache.data;
|
||||||
|
}
|
||||||
const q = db
|
const q = db
|
||||||
.select({
|
.select({
|
||||||
ean: precios.ean,
|
ean: precios.ean,
|
||||||
|
@ -16,5 +21,7 @@ export const load: PageServerLoad = async ({ params }) => {
|
||||||
.orderBy(sql`random()`)
|
.orderBy(sql`random()`)
|
||||||
.limit(150);
|
.limit(150);
|
||||||
const res = await q;
|
const res = await q;
|
||||||
return { precios: res };
|
const data = { precios: res };
|
||||||
|
cache = { key: new Date(), data };
|
||||||
|
return data;
|
||||||
};
|
};
|
||||||
|
|
|
@ -3,6 +3,10 @@
|
||||||
import type { PageData } from "./$types";
|
import type { PageData } from "./$types";
|
||||||
|
|
||||||
export let data: PageData;
|
export let data: PageData;
|
||||||
|
$: precios = data.precios.filter(
|
||||||
|
(d): d is { ean: string; name: string; imageUrl: string | null } =>
|
||||||
|
!!d.name,
|
||||||
|
);
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<h1 class="text-xl">WIP</h1>
|
<h1 class="text-xl">WIP</h1>
|
||||||
|
@ -32,7 +36,7 @@
|
||||||
<section>
|
<section>
|
||||||
<h2 class="text-lg font-bold">Random</h2>
|
<h2 class="text-lg font-bold">Random</h2>
|
||||||
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
||||||
{#each data.precios as product}
|
{#each precios as product}
|
||||||
<li>
|
<li>
|
||||||
<ProductPreview {product} />
|
<ProductPreview {product} />
|
||||||
</li>
|
</li>
|
||||||
|
|
|
@ -9,8 +9,6 @@ export const load: PageServerLoad = async ({ params }) => {
|
||||||
.select()
|
.select()
|
||||||
.from(precios)
|
.from(precios)
|
||||||
.where(eq(precios.ean, params.ean))
|
.where(eq(precios.ean, params.ean))
|
||||||
.groupBy(precios.warcRecordId)
|
|
||||||
.having(max(precios.parserVersion))
|
|
||||||
.orderBy(precios.fetchedAt);
|
.orderBy(precios.fetchedAt);
|
||||||
const res = await q;
|
const res = await q;
|
||||||
if (res.length === 0) return error(404, "Not Found");
|
if (res.length === 0) return error(404, "Not Found");
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
||||||
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
||||||
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
||||||
|
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
|
||||||
};
|
};
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|
|
@ -1,19 +1,17 @@
|
||||||
import { error } from "@sveltejs/kit";
|
import { error } from "@sveltejs/kit";
|
||||||
import { eq, max, sql } from "drizzle-orm";
|
import { sql } from "drizzle-orm";
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { db } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({ url }) => {
|
export const load: PageServerLoad = async ({ url }) => {
|
||||||
const query = url.searchParams.get("q");
|
const query = url.searchParams.get("q");
|
||||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||||
if (query) {
|
if (query) {
|
||||||
results = db.all(
|
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||||
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
|
||||||
join precios p on p.ean = f.ean
|
join precios p on p.ean = f.ean
|
||||||
where f.name match ${query}
|
where f.name match ${`"${query}"`}
|
||||||
group by p.ean;`,
|
group by p.ean;`;
|
||||||
);
|
results = db.all(sqlQuery);
|
||||||
}
|
}
|
||||||
|
|
||||||
return { query, results };
|
return { query, results };
|
||||||
|
|
|
@ -15,5 +15,6 @@
|
||||||
"noEmit": true,
|
"noEmit": true,
|
||||||
"forceConsistentCasingInFileNames": true
|
"forceConsistentCasingInFileNames": true
|
||||||
},
|
},
|
||||||
"include": ["**/*.ts", "**/*.js"]
|
"include": ["**/*.ts", "**/*.js"],
|
||||||
|
"exclude": ["sitio/build"]
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue