Merge branch 'master' into wip-rust-downloader

This commit is contained in:
Cat /dev/Nulo 2024-01-08 10:29:24 -03:00
commit c56272dc30
47 changed files with 959 additions and 1009 deletions

View file

@ -1,7 +1,7 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/alpine
{
"name": "Alpine",
"name": "Debian",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:debian",
"features": {

View file

@ -4,4 +4,11 @@ data/carrefour/
downloader/
node_modules/
*/node_modules/
*/Containerfile
Containerfile
*/Containerfile
Dockerfile
*/Dockerfile
*.warc.zst
.git
scraper/debug/
*/target/

54
.github/workflows/container.yml vendored Normal file
View file

@ -0,0 +1,54 @@
# Type-checks the TypeScript workspaces and, only if that succeeds, builds the
# container image and pushes it to the GitHub container registry.
name: check and publish container image
on:
  push:
    branches: ["master"]
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
jobs:
  # Runs the `check` script of the sitio and scraper workspaces.
  check:
    name: chequear typescript
    runs-on: ubuntu-latest
    steps:
      # Same checkout major version as the build job below.
      - uses: actions/checkout@v4
      - uses: oven-sh/setup-bun@v1
      - run: bun install
        working-directory: ./sitio
      - run: bun check
        working-directory: ./sitio
      - run: bun install
        working-directory: ./scraper
      - run: bun check
        working-directory: ./scraper
  # Builds the image from the repository root and pushes it as <repo>/sitio.
  build-and-push-sitio:
    needs: check
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

3
.gitignore vendored
View file

@ -13,3 +13,6 @@ scraper/x.tsv
*.tmp
target/
.env.*
*/flamegraph.svg
*/perf.data*

2
.vscode/launch.json vendored
View file

@ -13,7 +13,7 @@
// https://github.com/vadimcn/codelldb/issues/884
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
},
"args": ["../data/samples/Carrefour.50.txt"],
"args": ["../data/carrefour"],
"env": {}
},
{

30
Dockerfile Normal file
View file

@ -0,0 +1,30 @@
# Shared base for the build stage: Bun on Alpine.
FROM docker.io/oven/bun:1-alpine AS base
WORKDIR /usr/src/app

# Build stage: installs the workspace dependencies, builds the site and bundles
# the scraper CLI into a single file.
FROM base AS build
ENV NODE_ENV=production
RUN apk add --no-cache nodejs
COPY . .
RUN bun install --frozen-lockfile
RUN cd sitio && \
    bun run build
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js

# Runtime stage: only the built artifacts are copied in from the build stage.
FROM cgr.dev/chainguard/wolfi-base
RUN apk add --no-cache nodejs npm jq bun

# Site
COPY --from=build /usr/src/app/sitio/package.json package.real.json
# Writes a reduced package.json that keeps only the site's "dependencies"
# (extracted with jq) and installs them with npm.
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .

# Scraper
COPY --from=build /tmp/cli.build.js /bin/scraper
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle

ENV NODE_ENV=production
ENV DB_PATH=/db/db.db
EXPOSE 3000
CMD ["node", "."]

BIN
bun.lockb

Binary file not shown.

View file

@ -1,17 +0,0 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}

3
data/Jumbo.txt Normal file
View file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
size 922185

100
data/samples/Jumbo.100.txt Normal file
View file

@ -0,0 +1,100 @@
https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p
https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p
https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p
https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p
https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p
https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p
https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p
https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p
https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p
https://www.jumbo.com.ar/pizza-muzarella-e/p
https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p
https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p
https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p
https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p
https://www.jumbo.com.ar/espumante-chandon-demi-sec/p
https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p
https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p
https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p
https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p
https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p
https://www.jumbo.com.ar/campera-mujer-puffer-larga/p
https://www.jumbo.com.ar/tabla-de-quesos/p
https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p
https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p
https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p
https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p
https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p
https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p
https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p
https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p
https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p
https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p
https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p
https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p
https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p
https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p
https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p
https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p
https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p
https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p
https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p
https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p
https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p
https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p
https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p
https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p
https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p
https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p
https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p
https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p
https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p
https://www.jumbo.com.ar/oregano-100-gr/p
https://www.jumbo.com.ar/puerro-song/p
https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p
https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p
https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p
https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p
https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p
https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p
https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p
https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p
https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p
https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p
https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p
https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p
https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p
https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p
https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p
https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p
https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p
https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p
https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p
https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p
https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p
https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p
https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p
https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p
https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p
https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p
https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p
https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p
https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p
https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p
https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p
https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p
https://www.jumbo.com.ar/sabana-king-caracol-krea/p
https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p
https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p
https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p
https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p
https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p
https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p
https://www.jumbo.com.ar/budin-limon-y-amapolas/p
https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p
https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p
https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p
https://www.jumbo.com.ar/notcreamcheese-210-gr/p
https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p
https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p
https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p

View file

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
},
"devDependencies": {
"@types/bun": "^1.0.0",

View file

@ -2,15 +2,23 @@ export enum Supermercado {
Dia = "Dia",
Carrefour = "Carrefour",
Coto = "Coto",
Jumbo = "Jumbo",
}
/** Supermarkets exported as a plain array of enum members. */
export const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
Supermercado.Jumbo,
];
/** Maps a store's hostname to the supermarket it belongs to. */
export const hosts: { [host: string]: Supermercado } = {
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
"www.carrefour.com.ar": Supermercado.Carrefour,
"www.cotodigital3.com.ar": Supermercado.Coto,
"www.jumbo.com.ar": Supermercado.Jumbo,
};
/**
 * Hex color string for each supermarket; the mapped type requires an entry
 * for every enum member.
 * NOTE(review): presumably used by the site's charts — confirm against callers.
 */
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "#d52b1e",
[Supermercado.Carrefour]: "#19549d",
[Supermercado.Coto]: "#e20025",
[Supermercado.Jumbo]: "#2dc850",
};

View file

@ -1,17 +0,0 @@
{
"name": "dia-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.0"
}
}

View file

@ -1,5 +1,6 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
export async function scrapCarrefourProducts() {
await scrapBySitemap();
@ -25,17 +26,7 @@ async function scrapBySitemap() {
async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
saveUrls(getUrlsFromSitemap(xml));
},
{ concurrency: 3 }
);

14
link-scrapers/common.ts Normal file
View file

@ -0,0 +1,14 @@
import { decodeXML } from "entities";
/**
 * Extracts the unique URLs found in the `<loc>` elements of a sitemap.
 *
 * @param xml Raw sitemap XML.
 * @returns De-duplicated URLs in order of first appearance, with XML entities
 *   (such as `&amp;`) decoded.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // HTMLRewriter may deliver the text of a single <loc> in several chunks, so
  // the pieces are accumulated until the chunk flagged as the last one of the
  // text node arrives; otherwise a split URL would be stored as fragments.
  let pending = "";
  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        urls.add(decodeXML(txt));
      },
    })
    .transform(new Response(xml));
  return Array.from(urls);
}

View file

@ -1,4 +1,3 @@
import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom";
import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
@ -28,12 +27,13 @@ function getPage(url: string) {
return async () => {
let html;
try {
html = await getHtml(url);
const res = await fetch(url);
html = await res.text();
} catch (error) {
await getPage(url)();
return;
}
const { document } = parseHTML(html.toString("utf-8"));
const { document } = parseHTML(html);
const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),

View file

@ -1,7 +1,7 @@
import pMap from "p-map";
import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen",
@ -81,21 +81,15 @@ async function scrapBySitemap() {
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
];
await pMap(sitemaps, async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
await pMap(
sitemaps,
async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
saveUrls(getUrlsFromSitemap(xml));
},
{ concurrency: 3 }
);
}
async function scrapBySite() {
@ -110,8 +104,9 @@ async function scrapBySite() {
await pMap(
links,
async (url) => {
const html = await getHtml(url);
const { document } = parseHTML(html.toString("utf-8"));
const res = await fetch(url);
const html = await res.text();
const { document } = parseHTML(html);
const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>(

38
link-scrapers/jumbo.ts Normal file
View file

@ -0,0 +1,38 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
/**
 * Entry point of the Jumbo link scraper: delegates to scrapBySitemap and
 * resolves once it has finished.
 */
export async function scrapJumboProducts() {
await scrapBySitemap();
}
/**
 * Downloads each Jumbo product sitemap (at most three at a time) and hands the
 * URLs it lists to saveUrls.
 */
async function scrapBySitemap() {
  // taken from https://www.jumbo.com.ar/sitemap.xml
  const sitemapUrls = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];

  // Fetches one sitemap and stores every URL found in it.
  const saveSitemap = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    saveUrls(getUrlsFromSitemap(body));
  };

  await pMap(sitemapUrls, saveSitemap, { concurrency: 3 });
}

View file

@ -1,5 +1,5 @@
{
"name": "coto-link-scraper",
"name": "link-scrapers",
"type": "module",
"version": "1.0.0",
"description": "",
@ -11,6 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"entities": "^4.5.0",
"linkedom": "^0.16.5",
"p-queue": "^8.0.1"
}

View file

@ -2,9 +2,7 @@
"name": "preciazo",
"private": true,
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"carrefour-link-scraper",
"link-scrapers",
"scraper",
"sitio",
"db-datos"

View file

@ -4,33 +4,23 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
## componentes (en orden de proceso)
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
- [warcificator](./warcificator/) descarga las paginas de productos y genera un archivo [WARC](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) con ellas
- el [scraper](./scraper/) procesa estos WARCs, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
## setup
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js.
aparte, se necesita zstd, que se usa para comprimir los WARCs eficientemente. seguro está disponible en las repos de tu distro favorita :)
empezá descargando un WARC con 50 páginas de sample, y recomprimilo con zstd:
```
wget --no-verbose --tries=3 --delete-after --input-file ./data/samples/Dia.txt --warc-file=dia-sample
gzip -dc dia-sample.warc.gz | zstd --long -15 --no-sparse -o dia-sample.warc.zst
```
después, scrapealo a una BD:
después, escrapea un sample de productos de Carrefour a una BD:
```
cd scraper/
bun install
bun cli.ts scrap ../dia-sample.warc.zst
bun cli.ts scrap ../data/samples/Carrefour.50.txt
```
ahora miralo en el sitio:

View file

@ -1,42 +0,0 @@
FROM docker.io/oven/bun:1-alpine AS base
WORKDIR /usr/src/app
FROM base AS builder
ENV NODE_ENV=production
COPY . .
RUN bun install --frozen-lockfile \
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
&& rm -rf node_modules/
# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd
FROM docker.io/rust:1.74 AS warcificator-builder
WORKDIR /usr/src/
RUN rustup target add x86_64-unknown-linux-musl
RUN apt-get update && apt-get install -y musl-tools musl-dev
RUN USER=root cargo new warcificator
WORKDIR /usr/src/warcificator
COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./
RUN cargo build --release
COPY ./warcificator/src ./src
RUN cargo install --target x86_64-unknown-linux-musl --path .
FROM base
RUN apk add --no-cache wget zstd tini
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper
COPY --from=builder /tmp/cli.build.js /bin/scraper
COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=builder /usr/src/app/data /listas
WORKDIR /app
VOLUME /db
ENV NODE_ENV=production
ENV DB_PATH=/db/db.db
ENV LISTS_DIR=/listas/
CMD ["tini", "/bin/busybox", "crond", "-f", "-l2"]
# CMD ["bun", "/bin/scraper"]

View file

@ -1,29 +1,20 @@
import { mkdtemp, access, writeFile } from "node:fs/promises";
import { mkdtemp, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join, resolve } from "node:path";
import { spawn } from "node:child_process";
import { Supermercado, hosts } from "db-datos/supermercado.js";
import { join } from "node:path";
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
import PQueue from "p-queue";
import { format, formatDuration, intervalToDuration } from "date-fns";
import { parseWarc } from "./scrap.js";
import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun";
import { formatDuration, intervalToDuration } from "date-fns";
import { downloadList } from "./scrap.js";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
];
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 1 });
const scrapQueue = new PQueue({ concurrency: 4 });
export async function auto() {
const a = new Auto();
@ -31,35 +22,9 @@ export async function auto() {
}
class Auto {
s3Config?: { s3: S3Client; bucketName: string };
telegramConfig?: { token: string; chatId: string };
constructor() {
if (
!process.env.S3_ACCESS_KEY_ID ||
!process.env.S3_SECRET_ACCESS_KEY ||
!process.env.S3_BUCKET_NAME
) {
if (process.env.NODE_ENV === "development") {
console.warn("faltan creds de s3, no voy a subir a s3");
} else {
throw new Error("faltan creds de s3");
}
} else {
this.s3Config = {
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
s3: new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: process.env.S3_ACCESS_KEY_ID,
secretAccessKey: process.env.S3_SECRET_ACCESS_KEY,
},
}),
bucketName: process.env.S3_BUCKET_NAME,
};
}
if (!process.env.TELEGRAM_BOT_TOKEN)
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
else if (!process.env.TELEGRAM_BOT_CHAT_ID)
@ -89,6 +54,9 @@ class Auto {
case "Carrefour":
await scrapCarrefourProducts();
break;
case "Jumbo":
await scrapJumboProducts();
break;
}
this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
@ -107,93 +75,29 @@ class Auto {
const urls = results.map((r) => r.url);
await writeFile(listPath, urls.join("\n") + "\n");
const date = new Date();
const zstdWarcName = `${supermercado}-${format(
date,
"yyyy-MM-dd-HH:mm"
)}.warc.zst`;
const zstdWarcPath = join(ctxPath, zstdWarcName);
const subproc = Bun.spawn({
cmd: ["warcificator", listPath, zstdWarcPath],
stderr: "ignore",
stdout: "ignore",
cwd: ctxPath,
});
const t0 = performance.now();
await subproc.exited;
this.inform(
`[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
);
if (!(await fileExists(zstdWarcPath))) {
const err = this.report(`no encontré el ${zstdWarcPath}`);
throw err;
}
this.scrapAndInform({ zstdWarcPath, zstdWarcName });
try {
await this.uploadToBucket({
fileName: zstdWarcName,
file: Bun.file(zstdWarcPath),
});
} catch (error) {
this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
console.error(error);
}
this.scrapAndInform({ listPath });
// TODO: borrar archivos temporales
}
async scrapAndInform({
zstdWarcPath,
zstdWarcName,
}: {
zstdWarcPath: string;
zstdWarcName: string;
}) {
async scrapAndInform({ listPath }: { listPath: string }) {
const res = await scrapQueue.add(async () => {
const t0 = performance.now();
const progress = await parseWarc(zstdWarcPath);
const progress = await downloadList(listPath);
return { took: performance.now() - t0, progress };
});
if (res) {
const { took, progress } = res;
this.inform(
`Procesado ${zstdWarcName} (${progress.done} ok, ${
progress.errors.length
} errores) (tardó ${formatMs(took)})`
`Procesado ${listPath} (${progress.done} ok, ${
progress.skipped
} skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})`
);
} else {
this.inform(`Algo falló en ${zstdWarcName}`);
this.inform(`Algo falló en ${listPath}`);
}
}
async uploadToBucket({
fileName,
file,
}: {
fileName: string;
file: BunFile;
}) {
if (!this.s3Config) {
this.inform(
`[s3] Se intentó subir ${fileName} pero no tenemos creds de S3`
);
return;
}
const parallelUploads3 = new Upload({
client: this.s3Config.s3,
params: {
Bucket: this.s3Config.bucketName,
Key: fileName,
Body: file,
},
});
await parallelUploads3.done();
}
inform(msg: string) {
this.sendTelegramMsg(msg);
console.info(msg);
@ -216,16 +120,6 @@ class Auto {
}
}
// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists
async function fileExists(path: string) {
try {
access(path);
return true;
} catch {
return false;
}
}
function formatMs(ms: number) {
return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) }));
}

View file

@ -1,8 +1,9 @@
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { auto } from "./auto.js";
import { parseWarc } from "./scrap.js";
import { downloadList, getProduct } from "./scrap.js";
if (process.argv[2] === "auto") {
await auto();
@ -12,17 +13,24 @@ if (process.argv[2] === "auto") {
await scrapDiaProducts();
} else if (process.argv[2] === "scrap-coto-links") {
await scrapCotoProducts();
} else if (process.argv[2] === "scrap-jumbo-links") {
await scrapJumboProducts();
} else if (process.argv[2] === "scrap-link") {
const url = new URL(process.argv[3]);
const res = await fetch(url);
const text = await res.text();
console.info(await getProduct(url, text));
} else if (process.argv[2] === "scrap") {
const warcPaths = process.argv.slice(3);
if (warcPaths.length > 0) {
for (const path of warcPaths) {
const res = await parseWarc(path);
const urlLists = process.argv.slice(3);
if (urlLists.length > 0) {
for (const path of urlLists) {
const res = await downloadList(path);
console.info("=======================================");
console.info(path, res);
console.info("=======================================");
}
} else {
console.error("Especificá WARCs para scrapear.");
console.error("Especificá listas de urls para scrapear.");
process.exit(1);
}
} else {

View file

@ -1,13 +0,0 @@
export async function getHtml(url: string) {
const res = await fetch(url);
return readableToBuffer(res.body!);
}
async function readableToBuffer(source: AsyncIterable<any>) {
// https://stackoverflow.com/a/72891118
const buffers = [];
for await (const data of source) {
buffers.push(data);
}
return Buffer.concat(buffers);
}

View file

@ -5,8 +5,7 @@
"description": "",
"main": "index.js",
"scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
"check": "tsc"
},
"keywords": [],
"author": "",
@ -16,11 +15,11 @@
"@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6",
"db-datos": "workspace:^",
"drizzle-orm": "=0.29.1",
"drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5",
"nanoid": "^5.0.4",
"p-map": "^7.0.1",
"p-queue": "^8.0.1",
"warcio": "^2.2.1",
"zod": "^3.22.4"
},
"devDependencies": {

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>(

View file

@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll(
'script[type="application/ld+json"]'
);
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
return Array.from(scripts, (script) => JSON.parse(script.innerHTML));
}
function findJsonLd(dom: Window, type: string): object | undefined {
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
@ -31,8 +31,9 @@ const zProductLd = z.object({
"@type": z.literal("Product"),
name: z.string(),
image: z.string(),
sku: z.string().optional(),
offers: z.object({
offers: z.tuple([
offers: z.array(
z.object({
"@type": z.literal("Offer"),
price: z.number(),
@ -41,8 +42,8 @@ const zProductLd = z.object({
"http://schema.org/OutOfStock",
"http://schema.org/InStock",
]),
}),
]),
})
),
}),
});
type ProductLd = z.infer<typeof zProductLd>;

View file

@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) {
}
function getPriceFromText({ document }: Window) {
const el = document.querySelector(".atg_store_newPrice");
if (!el?.textContent) throw new Error("no encuentro el precio");
if (!el?.textContent) return null;
const nStr = el.textContent
.trim()
.replace("$", "")
@ -27,12 +27,16 @@ function getPriceFromText({ document }: Window) {
.replace(",", ".");
return parseFloat(nStr) * 100;
}
/**
 * Reports whether the product page lacks the `.product_not_available`
 * element, i.e. the product is treated as in stock when that element is absent.
 */
function getInStock({ document }: Window) {
  const notAvailableMarker = document.querySelector(".product_not_available");
  return !notAvailableMarker;
}
export function getCotoProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom);
const inStock = getInStock(dom);
const name = dom.document
.querySelector("h1.product_page")
@ -40,5 +44,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const imageUrl =
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
return { name, imageUrl, ean, precioCentavos };
return { name, imageUrl, ean, precioCentavos, inStock };
}

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);

54
scraper/parsers/jumbo.ts Normal file
View file

@ -0,0 +1,54 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
import { z } from "zod";
// Expected shape of the Jumbo catalog search response: an array with exactly
// one product, whose `items` each carry an `ean` string.
const zJumboSearch = z.tuple([
z.object({
items: z.array(
z.object({
ean: z.string(),
})
),
}),
]);
/**
 * Looks up the EAN of a Jumbo product by querying the store's catalog search
 * endpoint with the product's SKU id.
 *
 * @param sku SKU id to search for (sent as `fq=skuId:<sku>`).
 * @returns The EAN shared by every item of the single product returned.
 * @throws If the request does not succeed, the response does not match
 *   `zJumboSearch`, the product has no items, or the items disagree on the EAN.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);
  const res = await fetch(url);
  // Fail with a clear message instead of trying to parse an error body.
  if (!res.ok) {
    throw new Error(
      `La búsqueda de Jumbo respondió ${res.status} para el SKU ${sku}`
    );
  }
  const json = await res.json();
  const parsed = zJumboSearch.parse(json);
  const items = parsed[0].items;
  // Without this guard an empty list would surface as an opaque TypeError.
  if (items.length === 0) {
    throw new Error(`La búsqueda de Jumbo no devolvió items para el SKU ${sku}`);
  }
  const ean = items[0].ean;
  if (!items.every((x) => x.ean === ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return ean;
}
/**
 * Builds the price record for a Jumbo product page.
 *
 * Price and stock are read from the page's meta tags, name/image/SKU from its
 * Product JSON-LD, and the EAN is resolved with an extra request keyed by the
 * SKU.
 *
 * @param html Product page HTML.
 * @throws If the JSON-LD carries no SKU, or if any of the helpers throws.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const dom = parseHTML(html);

  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const { name, image: imageUrl, sku: retailerSku } = getProductJsonLd(dom);

  if (!retailerSku) {
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  }

  return {
    name,
    imageUrl,
    ean: await getEanFromSearch(retailerSku),
    precioCentavos,
    inStock,
  };
}

View file

@ -1,112 +1,127 @@
/// <reference lib="dom" />
import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
import { writeFile, mkdir } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import { and, eq, sql } from "drizzle-orm";
import { db } from "db-datos/db.js";
import pMap from "p-map";
import { getJumboProduct } from "./parsers/jumbo.js";
const DEBUG = false;
const DEBUG = true;
const PARSER_VERSION = 4;
const getPrevPrecio = db
.select({ id: schema.precios.id })
.from(schema.precios)
.where(
and(
eq(schema.precios.warcRecordId, sql.placeholder("warcRecordId")),
eq(schema.precios.parserVersion, PARSER_VERSION)
)
)
.limit(1)
.prepare();
export type Precio = typeof schema.precios.$inferInsert;
export type Precioish = Omit<
Precio,
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
>;
export async function parseWarc(path: string) {
// const warc = createReadStream(path);
let progress: {
done: number;
errors: { error: any; warcRecordId: string; path: string }[];
} = { done: 0, errors: [] };
export async function downloadList(path: string) {
let list = (await Bun.file(path).text())
.split("\n")
.filter((s) => s.length > 0);
const proc = Bun.spawn(["zstdcat", "-d", path], {});
const warc = proc.stdout;
// TODO: tirar error si falla zstd
const parser = new WARCParser(warc);
for await (const record of parser) {
if (record.warcType === "response") {
if (!record.warcTargetURI) continue;
const warcRecordId = record.warcHeader("WARC-Record-ID");
if (!warcRecordId) throw new Error("No tiene WARC-Record-ID");
if (getPrevPrecio.get({ warcRecordId })) {
console.debug(`skipped ${warcRecordId}`);
continue;
}
if (record.httpHeaders?.statusCode !== 200) {
console.debug(
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
);
continue;
}
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
const html = await record.contentText();
const url = new URL(record.warcTargetURI);
try {
let ish: Precioish | undefined = undefined;
if (url.hostname === "www.carrefour.com.ar")
ish = getCarrefourProduct(html);
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
ish = getDiaProduct(html);
else if (url.hostname === "www.cotodigital3.com.ar")
ish = getCotoProduct(html);
else throw new Error(`Unknown host ${url.hostname}`);
const p: Precio = {
...ish,
fetchedAt: new Date(record.warcDate!),
url: record.warcTargetURI,
warcRecordId,
parserVersion: PARSER_VERSION,
};
await db.insert(schema.precios).values(p);
progress.done++;
} catch (error) {
console.error({ path, warcRecordId, error });
progress.errors.push({
path,
warcRecordId,
error,
});
if (DEBUG) {
const urlHash = createHash("md5")
.update(record.warcTargetURI!)
.digest("hex");
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);
console.error(`wrote html to ${output}`);
const results = await pMap(
list,
async (urlS) => {
let res: ScrapResult = { type: "skipped" };
for (let attempts = 0; attempts < 6; attempts++) {
if (attempts !== 0) await wait(1500);
res = await scrap(urlS);
if (res.type === "done" || res.type === "skipped") {
break;
}
}
if (res.type === "error") console.error(res);
return res;
},
{ concurrency: 32 }
);
let progress: {
done: number;
skipped: number;
errors: { error: any; url: string; debugPath: string }[];
} = { done: 0, skipped: 0, errors: [] };
for (const result of results) {
switch (result.type) {
case "done":
progress.done++;
break;
case "error":
progress.errors.push(result);
break;
case "skipped":
progress.skipped++;
break;
}
}
if ((await proc.exited) !== 0) {
throw new Error("zstd tiró un error");
}
return progress;
}
/**
 * Dispatches a downloaded page to the parser of the supermarket it came from.
 *
 * @param url URL the page was fetched from; only its hostname is inspected.
 * @param html Page body.
 * @throws Error when the hostname belongs to no known supermarket.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
}
// Outcome of scraping a single URL, discriminated by `type`:
// - "skipped": nothing was stored (no further detail is carried).
// - "done": the price row was stored.
// - "error": scraping failed; carries the URL, the thrown value and the path
//   where the page's HTML dump is (or would be) written for debugging.
type ScrapResult =
| { type: "skipped" }
| { type: "done" }
| { type: "error"; url: string; error: any; debugPath: string };
/**
 * Downloads one product page, parses it and stores the resulting price row.
 *
 * @param urlS Raw URL string.
 * @returns `skipped` when the URL is invalid or the server answers with a
 *   non-OK status; `done` once the row was inserted; `error` when the
 *   download, the parser or the insert fails. This function never rejects
 *   for those failures, so callers can decide whether to retry.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url: URL;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  // Where the HTML dump goes when DEBUG is on; the name is the md5 of the URL.
  const urlHash = createHash("md5").update(urlS).digest("hex");
  const output = join("debug", `${urlHash}.html`);

  let html: string;
  try {
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
  } catch (error) {
    // A network failure (DNS, reset, timeout) used to reject this promise and
    // with it the whole batch. Report it as a regular error result instead;
    // no HTML was received, so nothing is written to `debugPath`.
    return { type: "error", url: urlS, error, debugPath: output };
  }

  try {
    const ish = await getProduct(url, html);
    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };
    await db.insert(schema.precios).values(p);
    return { type: "done" };
  } catch (error) {
    if (DEBUG) {
      // Keep the offending page around so the parser can be debugged offline.
      await mkdir("debug", { recursive: true });
      await writeFile(output, html);
    }
    return {
      type: "error",
      url: urlS,
      error,
      debugPath: output,
    };
  }
}
/** Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

View file

@ -1,3 +1,4 @@
{
"extends": "../tsconfig.json"
"extends": "../tsconfig.json",
"exclude": ["../sitio"]
}

View file

@ -1,157 +0,0 @@
// Byte-level markers used while scanning the WARC stream. Each delimiter is
// kept both as a string and as a Buffer so it can be searched for in either
// representation.
const crlf = "\r\n";
const crlfB = Buffer.from(crlf, "utf-8");
// Blank line (CRLF CRLF): ends a header block.
const crlfcrlf = crlf + crlf;
const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8");
// Version line that every record is expected to start with.
const warc10B = Buffer.from("WARC/1.0", "utf-8");
// Returned by the line cursor when it runs past the last header line.
const emptyBuffer = Buffer.from("", "utf-8");
/**
 * Async generator that streams records out of a zstd-compressed WARC file.
 *
 * For every record it yields `{ fields, content }`: `fields` is a Map of the
 * WARC header names to their values, and `content` is the part of the record
 * payload that follows the embedded HTTP header block, de-chunked when the
 * HTTP headers say `Transfer-Encoding: chunked`.
 *
 * @param path Path of the compressed WARC file handed to `zstd`.
 * @throws Error when a record does not start with the `WARC/1.0` line.
 */
export async function* parseWARC(path: string) {
// Decompress with the external `zstd` binary writing to /dev/stdout; only the
// child's stdout stream is kept and its stderr is ignored.
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
stderr: "ignore",
}).stdout;
// Earlier experiment kept for reference: read from stdin and re-buffer the
// stream into 64 MiB pieces through a TransformStream.
// const warc = Bun.stdin.stream(1024 * 1024 * 128);
// let buffer: Uint8Array[] = [];
// const transform = new TransformStream<Uint8Array, Buffer>({
// transform(chunk, controller) {
// buffer.push(chunk);
// if (
// buffer.reduce((prev, curr) => prev + curr.length, 0) >
// 1024 * 1024 * 64
// ) {
// controller.enqueue(Buffer.concat(buffer));
// buffer = [];
// }
// },
// flush(controller) {
// controller.enqueue(Buffer.concat(buffer));
// },
// });
// warc.pipeTo(transform.writable);
const reader = warc.getReader();
// const reader = transform.readable.getReader();
// const warc = process.stdin;
// Pending, not yet parsed bytes (as a list of buffers to avoid re-copying on
// every read).
let arrays: Buffer[] = [];
let done = false;
while (!done) {
const r = await reader.readMany();
if (r.done) {
done = true;
} else {
arrays = arrays.concat(r.value.map((x) => Buffer.from(x)));
// Keep reading until at least 10 MiB are pending before trying to parse.
if (
arrays.reduce((prev, curr) => prev + curr.length, 0) <
1024 * 1024 * 10
)
continue;
}
let buf: Buffer;
// Parse a record only while another "WARC/1.0" marker shows up after the
// first 8 bytes of the pending data, i.e. while the start of a following
// record is already buffered.
// NOTE(review): the last record of the stream has no following marker, so it
// looks like it is never yielded - confirm.
while (
((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
buf.subarray(warc10B.length).includes(warc10B))
) {
// WARC header block: everything up to the first blank line.
const until = buf.indexOf(crlfcrlfB);
const header = buf.subarray(0, until);
const lines = splitBuffer(header, crlfB);
let i = 0;
// Cursor over the header lines; yields an empty buffer once exhausted, which
// is what terminates the field loop below.
const nextLine = () => {
const line = lines[i];
i++;
return line ? line : emptyBuffer;
};
let line: Buffer;
if (!(line = nextLine()).equals(warc10B)) {
throw new Error(`No WARC 1.0 header in '${line}'`);
}
let field;
let fields = new Map<string, string>();
// Read "Name: value" lines until the first empty line.
while (
((line = nextLine()),
(field = parseField(line.toString("utf8"))),
line.length !== 0)
) {
fields.set(field[0], field[1]);
}
// The record payload is the Content-Length bytes right after the blank line.
const length = parseInt(fields.get("Content-Length")!);
const rawHttp = buf.subarray(
until + crlfcrlfB.length,
until + crlfcrlfB.length + length
);
// HTTP header lines of the payload: from just after its first line
// (presumably the status line) through the blank line that ends the headers.
const rawHttpHeaders = rawHttp
.subarray(
rawHttp.indexOf(crlfB) + crlfB.length,
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
)
.toString();
let httpHeaders = new Map<string, string>();
rawHttpHeaders.split(crlf).forEach((line) => {
if (!line.length) return;
const [key, val] = line.split(": ");
httpHeaders.set(key, val);
});
// Body: everything in the payload after the HTTP header block.
let content = rawHttp.subarray(
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
);
if (httpHeaders.get("Transfer-Encoding") === "chunked") {
content = dechunk(content);
}
// console.debug(fields.get("WARC-Date"), content.length);
yield {
fields,
content,
};
// Drop the consumed record plus the CRLF CRLF that follows it; whatever is
// left becomes the new pending data.
arrays = [
buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
];
if (!arrays[0].length) break;
}
}
}
/**
 * Splits `buffer` on every occurrence of the byte sequence `val`.
 *
 * The returned pieces are views (`subarray`) over the input, exclude the
 * separator itself, and always contain at least one element: the whole input
 * when the separator never occurs.
 */
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
  const pieces: Buffer[] = [];
  let remaining = buffer;
  for (
    let at = remaining.indexOf(val);
    at !== -1;
    at = remaining.indexOf(val)
  ) {
    pieces.push(remaining.subarray(0, at));
    remaining = remaining.subarray(at + val.length);
  }
  pieces.push(remaining);
  return pieces;
}
/**
 * Splits a `Name: value` header line into its name and value.
 *
 * Only the first ": " separates the two, so a value that itself contains
 * ": " is returned whole instead of being cut at its first occurrence.
 *
 * @param line Header line without its trailing CRLF.
 * @returns `[name, value]`; when the line has no ": " the whole line is the
 *   name and the value is the empty string.
 */
function parseField(line: string): [string, string] {
  const separator = ": ";
  const at = line.indexOf(separator);
  if (at === -1) return [line, ""];
  return [line.slice(0, at), line.slice(at + separator.length)];
}
/**
 * Decodes an HTTP body sent with `Transfer-Encoding: chunked`.
 *
 * Each chunk is a hexadecimal size line, CRLF, that many bytes of data and a
 * closing CRLF. Decoding stops at the zero-size chunk (any trailer fields
 * after it are ignored), at an empty size line, or when the input ends
 * before another size line is complete.
 *
 * @param content Raw chunked body.
 * @returns The concatenated chunk data.
 * @throws Error when a size line is not a valid hexadecimal number; such
 *   input previously made the loop spin forever.
 */
function dechunk(content: Buffer): Buffer {
  const lineEnd = Buffer.from("\r\n", "utf-8");
  const chunks: Buffer[] = [];
  let rest = content;

  while (rest.length > 0) {
    const sizeEnd = rest.indexOf(lineEnd);
    // Truncated input: no complete size line left to read.
    if (sizeEnd === -1) break;

    const sizeLine = rest.subarray(0, sizeEnd).toString();
    if (sizeLine.length === 0) break;

    const size = parseInt(sizeLine, 16);
    if (Number.isNaN(size) || size < 0) {
      throw new Error(`Invalid chunk size '${sizeLine}'`);
    }
    // The zero-size chunk terminates the body; what follows are trailers.
    if (size === 0) break;

    const dataStart = sizeEnd + lineEnd.length;
    chunks.push(rest.subarray(dataStart, dataStart + size));
    rest = rest.subarray(dataStart + size + lineEnd.length);
  }

  return Buffer.concat(chunks);
}

View file

@ -1,31 +0,0 @@
# Build stage: copy the whole repository into a Bun image (with Node.js added)
# and build the `sitio` app.
FROM docker.io/oven/bun:1-alpine as build
RUN apk add --no-cache nodejs
WORKDIR /usr/src/app
COPY . .
WORKDIR /usr/src/app/sitio
RUN bun install && \
bun run build

# Disabled alternative: a separate stage installing the runtime dependencies
# with Bun, with their versions listed by hand.
# FROM docker.io/oven/bun:1-alpine as deps
# WORKDIR /usr/src/app/sitio
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos

# Runtime stage: plain Alpine running the built app with Node.js.
FROM docker.io/alpine:3.19
RUN apk add --no-cache tini nodejs npm jq
WORKDIR /app
# Generate a minimal package.json that keeps only the `dependencies` of the
# real one (extracted with jq), then install those with npm.
COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
# `db-datos` is copied from the build stage straight into node_modules.
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .

# Take protocol and host from the X-Forwarded-* headers; see the linked issue.
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto
ENV HOST_HEADER=x-forwarded-host

# The database file lives at /db/db.db, inside the /db volume.
VOLUME /db
ENV DB_PATH=/db/db.db
EXPOSE 3000
# tini runs `node .` in the working directory, which holds the build output.
CMD ["tini", "node", "."]

View file

@ -38,7 +38,8 @@
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4",
"croner": "^8.0.0",
"dayjs": "^1.11.10",
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
}
}

12
sitio/src/hooks.server.ts Normal file
View file

@ -0,0 +1,12 @@
import { spawn } from "child_process";
import Cron from "croner";
// Register the scraper cron job only when running in production. The pattern
// "15 3 * * *" fires every day at 03:15 (presumably in the server's local
// timezone - confirm against croner's defaults).
const isProduction = process.env.NODE_ENV === "production";
if (isProduction) {
  Cron("15 3 * * *", () => {
    runScraper();
  });
}
/**
 * Launches `bun /bin/scraper auto` as a child process that shares this
 * process's stdio. It does not wait for the child to finish.
 */
function runScraper() {
  const child = spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
  // A failed spawn (for example a missing `bun` binary) is reported through
  // the 'error' event; with no listener Node throws it as an unhandled error
  // and takes the whole server down.
  child.on("error", (error) => {
    console.error("No se pudo ejecutar el scraper:", error);
  });
}

View file

@ -1,8 +1,10 @@
<script lang="ts">
export let product: { ean: string; name: string; imageUrl: string };
export let product: { ean: string; name: string; imageUrl?: string | null };
</script>
<a href={`/ean/${product.ean}`} class="flex">
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
{#if product.imageUrl}
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
{/if}
<p class="text-xl">{product.name}</p>
</a>

View file

@ -1,9 +1,22 @@
import type { PageServerLoad } from "./$types";
import type { PageData, PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => {
let cache: null | { key: Date; data: { precios: Precios } } = null;
type Precios = {
ean: string;
name: string | null;
imageUrl: string | null;
}[];
export const load: PageServerLoad = async ({
params,
}): Promise<{ precios: Precios }> => {
if (cache && +new Date() < +cache.key + 1000 * 60 * 10) {
return cache.data;
}
const q = db
.select({
ean: precios.ean,
@ -12,9 +25,11 @@ export const load: PageServerLoad = async ({ params }) => {
})
.from(precios)
.groupBy(precios.ean)
.having(sql`max(length(name))`)
.having(sql`max(length(name)) and max(parser_version) and in_stock`)
.orderBy(sql`random()`)
.limit(150);
const res = await q;
return { precios: res };
const data = { precios: res };
cache = { key: new Date(), data };
return data;
};

View file

@ -3,6 +3,10 @@
import type { PageData } from "./$types";
export let data: PageData;
$: precios = data.precios.filter(
(d): d is { ean: string; name: string; imageUrl: string | null } =>
!!d.name,
);
</script>
<h1 class="text-xl">WIP</h1>
@ -32,7 +36,7 @@
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each data.precios as product}
{#each precios as product}
<li>
<ProductPreview {product} />
</li>

View file

@ -9,13 +9,11 @@ export const load: PageServerLoad = async ({ params }) => {
.select()
.from(precios)
.where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion))
.orderBy(precios.fetchedAt);
const res = await q;
if (res.length === 0) return error(404, "Not Found");
const meta = res.find((p) => p.name);
const meta = res.findLast((p) => p.name);
return { precios: res, meta };
};

View file

@ -17,6 +17,7 @@
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
};
</script>

View file

@ -1,18 +1,19 @@
import { error } from "@sveltejs/kit";
import { eq, max, sql } from "drizzle-orm";
import { sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
import { db } from "$lib/server/db";
export const load: PageServerLoad = async ({ url }) => {
const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
if (query) {
results = db.all(
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean
where f.name match ${query};`,
);
where f.name match ${`"${query}"`}
group by p.ean
having max(p.fetched_at) and max(p.in_stock)
order by p.in_stock desc;`;
results = db.all(sqlQuery);
}
return { query, results };

View file

@ -15,5 +15,6 @@
"noEmit": true,
"forceConsistentCasingInFileNames": true
},
"include": ["**/*.ts", "**/*.js"]
"include": ["**/*.ts", "**/*.js"],
"exclude": ["sitio/build"]
}

534
warcificator/Cargo.lock generated
View file

@ -24,7 +24,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01"
dependencies = [
"cfg-if",
"getrandom",
"once_cell",
"version_check",
"zerocopy",
@ -144,12 +143,6 @@ version = "3.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.5.0"
@ -205,6 +198,16 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.18"
@ -215,60 +218,14 @@ dependencies = [
]
[[package]]
name = "cssparser"
version = "0.31.2"
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.2",
"smallvec",
"powerfmt",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "encoding_rs"
version = "0.8.33"
@ -342,16 +299,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures-channel"
version = "0.3.30"
@ -391,24 +338,6 @@ dependencies = [
"pin-utils",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.11"
@ -470,20 +399,6 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "0.2.11"
@ -597,6 +512,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.151"
@ -629,26 +550,6 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "memchr"
version = "2.7.1"
@ -682,10 +583,14 @@ dependencies = [
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num_cpus"
@ -712,6 +617,12 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking"
version = "2.2.0"
@ -747,86 +658,6 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared 0.11.2",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.13"
@ -846,16 +677,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "proc-macro2"
@ -875,36 +700,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
@ -1033,22 +828,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1"
dependencies = [
"ahash",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"tendril",
]
[[package]]
name = "sct"
version = "0.7.1"
@ -1059,25 +838,6 @@ dependencies = [
"untrusted",
]
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags 2.4.1",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.193"
@ -1095,7 +855,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]
[[package]]
@ -1122,12 +882,12 @@ dependencies = [
]
[[package]]
name = "servo_arc"
version = "0.3.0"
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"stable_deref_trait",
"lazy_static",
]
[[package]]
@ -1139,12 +899,6 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slab"
version = "0.4.9"
@ -1176,49 +930,6 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.43"
@ -1252,14 +963,62 @@ dependencies = [
]
[[package]]
name = "tendril"
version = "0.4.3"
name = "thiserror"
version = "1.0.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2"
dependencies = [
"futf",
"mac",
"utf-8",
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "time"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
dependencies = [
"deranged",
"itoa",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f"
dependencies = [
"time-core",
]
[[package]]
@ -1277,6 +1036,11 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tl"
version = "0.7.7"
source = "git+https://github.com/evertedsphere/tl?branch=patch-1#56711166588fa6c7729a08e5740dca2526436316"
[[package]]
name = "tokio"
version = "1.35.1"
@ -1304,7 +1068,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]
[[package]]
@ -1343,10 +1107,35 @@ version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
"log",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-appender"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
dependencies = [
"crossbeam-channel",
"thiserror",
"time",
"tracing-subscriber",
]
[[package]]
name = "tracing-attributes"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.32"
@ -1354,6 +1143,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
dependencies = [
"log",
"once_cell",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
dependencies = [
"nu-ansi-term",
"sharded-slab",
"smallvec",
"thread_local",
"tracing-core",
"tracing-log",
]
[[package]]
@ -1383,12 +1198,6 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -1407,10 +1216,10 @@ dependencies = [
]
[[package]]
name = "utf-8"
version = "0.7.6"
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
@ -1440,10 +1249,13 @@ dependencies = [
"async-channel",
"reqwest",
"rusqlite",
"scraper",
"serde",
"serde_json",
"tl",
"tokio",
"tracing",
"tracing-appender",
"tracing-subscriber",
]
[[package]]
@ -1473,7 +1285,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
"wasm-bindgen-shared",
]
@ -1507,7 +1319,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -1534,6 +1346,28 @@ version = "0.25.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.48.0"
@ -1627,5 +1461,5 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]

View file

@ -7,13 +7,18 @@ edition = "2021"
[dependencies]
async-channel = "2.1.1"
# lol_html = "1.2.0"
reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
"gzip",
"brotli",
] }
rusqlite = "0.30.0"
scraper = "0.18.1"
# scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] }
tokio = { version = "1.35.1", features = ["full"] }
tracing = { version = "0.1", features = ["log"] }
tracing-appender = "0.2.3"
tracing-subscriber = "0.3.18"

View file

@ -1,9 +1,18 @@
use async_channel::{Receiver, Sender};
// use lol_html::{
// element,
// html_content::{Element, TextChunk},
// text, ElementContentHandlers, HtmlRewriter, Selector, Settings,
// };
use rusqlite::Connection;
use scraper::{Element, Html, Selector};
use serde::de::value;
use tl::VDom;
// use scraper::{Element, Html, Selector};
use std::{
borrow::Cow,
env::args,
fs,
ops::Deref,
time::{SystemTime, UNIX_EPOCH},
};
use tokio::io::{stderr, AsyncWriteExt};
@ -21,6 +30,109 @@ struct PrecioPoint {
image_url: Option<String>,
}
// fn main() {
// let arg = args().skip(1).next().unwrap();
// let file_iter = fs::read_dir(arg)
// .unwrap()
// .filter(|pr| {
// if let Ok(p) = pr {
// !p.file_name().to_str().unwrap().ends_with(".link")
// } else {
// false
// }
// })
// .take(1000)
// .map(|f| fs::read(f.unwrap().path()).unwrap());
// let mut i = 0;
// for item in file_iter {
// i = i + 1;
// {
// // let mut text: Option<String> = None;
// // let mut price_str: Option<String> = None;
// // let mut rewriter = HtmlRewriter::new(
// // Settings {
// // element_content_handlers: vec![
// // // Rewrite insecure hyperlinks
// // element!("a[href]", |el| {
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
// // el.set_attribute("href", &href).unwrap();
// // Ok(())
// // }),
// // (
// // Cow::Owned("a".parse().unwrap()),
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
// // ),
// // element!(
// // "meta[property=\"product:price:amount\"]",
// // extract_first_attr(&mut price_str, "content")
// // ),
// // ],
// // memory_settings: lol_html::MemorySettings {
// // preallocated_parsing_buffer_size: 1024 * 16,
// // max_allowed_memory_usage: std::usize::MAX,
// // },
// // ..Settings::default()
// // },
// // |_: &[u8]| {},
// // );
// // rewriter.write(&item).unwrap();
// // rewriter.end().unwrap();
// // println!("{:#?}", price_str);
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
// let html = String::from_utf8(item).unwrap();
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
// match parse_carrefour("".into(), &dom) {
// Ok(point) => {
// // println!("{:?}", point);
// }
// Err(err) => {
// // println!("Error {:#?}: {}", err, html);
// }
// };
// }
// }
// println!("n={}", i);
// }
// fn extract_first_text(
// output: &mut Option<String>,
// ) -> impl FnMut(
// &mut TextChunk,
// ) -> Result<(), Box<(dyn std::error::Error + std::marker::Send + Sync + 'static)>>
// + '_ {
// move |el| {
// if *output == None {
// *output = Some(el.as_str().to_owned());
// }
// Ok(())
// }
// }
// fn extract_first_attr<'a>(
// output: &'a mut Option<String>,
// attr: &'a str,
// ) -> impl FnMut(
// &mut Element,
// ) -> Result<(), Box<(dyn std::error::Error + std::marker::Send + Sync + 'static)>>
// + 'a {
// move |el| {
// if *output == None {
// if let Some(value) = el.get_attribute(attr) {
// *output = Some(value);
// }
// }
// Ok(())
// }
// }
#[tokio::main]
async fn main() {
let mut args = args().skip(1);
@ -38,7 +150,7 @@ async fn main() {
let (res_sender, res_receiver) = async_channel::unbounded::<PrecioPoint>();
let mut handles = Vec::new();
for _ in 1..16 {
for _ in 1..32 {
let rx = receiver.clone();
let tx = res_sender.clone();
handles.push(tokio::spawn(worker(rx, tx)));
@ -81,14 +193,7 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
#[derive(Debug)]
enum FetchError {
HttpError(reqwest::Error),
NoPriceMetaEl,
NoMetaContent,
NotANumber,
NoStockMetaEl,
NoValidStockMeta,
NoSeedState,
NoProductInSeedState,
NoProductSkuInSeedState,
ParseError(&'static str),
}
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
@ -102,69 +207,68 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
.await
.map_err(|e| FetchError::HttpError(e))?;
let html = Html::parse_document(&body);
let dom = tl::parse(&body, tl::ParserOptions::default()).unwrap();
// let parser = dom.parser();
let point = parse_carrefour(url, html)?;
let point = parse_carrefour(url, &dom)?;
Ok(point)
}
fn parse_carrefour(url: String, html: Html) -> Result<PrecioPoint, FetchError> {
let meta_price_sel = Selector::parse("meta[property=\"product:price:amount\"]").unwrap();
let precio_centavos = match html.select(&meta_price_sel).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr.parse::<f64>() {
Ok(f) => Ok((f * 100.0) as u64),
Err(_) => Err(FetchError::NotANumber),
},
None => Err(FetchError::NoMetaContent),
},
None => Err(FetchError::NoPriceMetaEl),
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchError> {
let precio_centavos = {
get_meta_content(dom, "product:price:amount")?
.map(|s| {
s.parse::<f64>()
.map_err(|_| FetchError::ParseError("Failed to parse number"))
})
.transpose()
.map(|f| f.map(|f| (f * 100.0) as u64))
}?;
let meta_stock_el = Selector::parse("meta[property=\"product:availability\"]").unwrap();
let in_stock = match html.select(&meta_stock_el).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr {
"oos" => Ok(Some(false)),
"instock" => Ok(Some(true)),
_ => Err(FetchError::NoValidStockMeta),
},
None => Err(FetchError::NoMetaContent),
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
let in_stock = match in_stock_meta {
Some(s) => match s.as_ref() {
"oos" => Some(false),
"instock" => Some(true),
_ => return Err(FetchError::ParseError("Not a valid product:availability")),
},
None => Err(FetchError::NoStockMetaEl),
}?;
None => None,
};
let ean = {
let state = parse_script_json(&html, "__STATE__").ok_or(FetchError::NoSeedState)?;
let seed_state = &state.as_object().ok_or(FetchError::NoSeedState)?;
let (_, product_json) = seed_state
let json = &parse_script_json(dom, "__STATE__")?;
let state = json
.as_object()
.ok_or(FetchError::ParseError("Seed state not an object"))?;
let (_, product_json) = state
.into_iter()
.find(|(key, val)| {
key.starts_with("Product:")
&& val.as_object().map_or(false, |val| {
val.get("__typename")
.map_or(false, |typename| typename == "Product")
})
&& val
.as_object()
.and_then(|val| val.get("__typename"))
.map_or(false, |typename| typename == "Product")
})
.ok_or(FetchError::NoProductInSeedState)?;
.ok_or(FetchError::ParseError("No product in seed state"))?;
let cache_id = product_json
.get("cacheId")
.ok_or(FetchError::NoProductInSeedState)?;
let (_, product_sku_json) = seed_state
.into_iter()
.filter_map(|(key, val)| val.as_object().map_or(None, |o| Some((key, o))))
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No cacheId in seed state"))?;
let (_, product_sku_json) = state
.iter()
.find(|(key, val)| {
key.starts_with(&format!("Product:{}", cache_id))
&& val
.get("__typename")
.map_or(false, |typename| typename == "SKU")
&& val.as_object().map_or(false, |obj| {
obj.get("__typename")
.map_or(false, |typename| typename == "SKU")
})
})
.ok_or(FetchError::NoProductSkuInSeedState)?;
.ok_or(FetchError::ParseError("No Product:cacheId* found"))?;
product_sku_json
.get("ean")
.ok_or(FetchError::NoProductSkuInSeedState)?
.as_str()
.ok_or(FetchError::NoProductSkuInSeedState)?
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No product SKU in seed state"))?
.to_string()
};
@ -175,29 +279,70 @@ fn parse_carrefour(url: String, html: Html) -> Result<PrecioPoint, FetchError> {
name: None,
image_url: None,
parser_version: 5,
precio_centavos: Some(precio_centavos),
precio_centavos: precio_centavos,
url: url,
})
}
fn parse_script_json(html: &Html, varname: &str) -> Option<serde_json::Value> {
let template_sel = Selector::parse(&format!(
"template[data-type=\"json\"][data-varname=\"{}\"]",
varname
))
.unwrap();
match html.select(&template_sel).next() {
Some(value) => match value.first_element_child() {
Some(script) => match serde_json::from_str(&script.inner_html()) {
Ok(val) => val,
Err(_) => None,
},
None => None,
},
None => None,
/// Reads the `content` attribute of the first `<meta property="{prop}">` tag in `dom`.
///
/// # Returns
/// * `Ok(Some(value))` with the attribute value as a (possibly borrowed) string.
/// * `Ok(None)` when no matching tag could be resolved (the selector yielded
///   nothing, the handle did not resolve, or the node is not a tag).
///
/// # Errors
/// Returns `FetchError::ParseError` when the tag was found but the lookup of
/// its `content` attribute produced no value.
fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a, str>>, FetchError> {
    let selector = format!("meta[property=\"{}\"]", prop);

    // Resolve the first match down to a concrete tag, giving up quietly at any step.
    let meta_tag = dom
        .query_selector(&selector)
        .and_then(|mut matches| matches.next())
        .and_then(|handle| handle.get(dom.parser()))
        .and_then(|node| node.as_tag());

    // A missing meta tag is not an error: the caller decides what absence means.
    let Some(meta_tag) = meta_tag else {
        return Ok(None);
    };

    // `get` yields a nested Option; `flatten` collapses both "no value" cases into one.
    let content = meta_tag
        .attributes()
        .get("content")
        .flatten()
        .ok_or(FetchError::ParseError("Failed to get content attr"))?;

    Ok(Some(content.as_utf8_str()))
}
/// Extracts the JSON embedded under the first
/// `<template data-type="json" data-varname="{varname}">` element of `dom`.
///
/// The payload is taken from the inner HTML of the first tag node found in the
/// template's child list and parsed into a `serde_json::Value`.
///
/// # Errors
/// * `FetchError::ParseError("Failed to get script tag")` when the template or
///   a tag node inside it cannot be located.
/// * `FetchError::ParseError("Couldn't parse JSON in script")` when the inner
///   HTML is not valid JSON.
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, FetchError> {
    let parser = dom.parser();
    let selector = format!(
        "template[data-type=\"json\"][data-varname=\"{}\"]",
        varname
    );

    // Step 1: resolve the first matching <template> down to a tag.
    let template = dom
        .query_selector(&selector)
        .and_then(|mut matches| matches.next())
        .and_then(|handle| handle.get(parser))
        .and_then(|node| node.as_tag());

    // Step 2: pick the first tag node among its children (text nodes are skipped).
    let script = template
        .and_then(|tag| {
            tag.children()
                .all(parser)
                .iter()
                .find(|child| child.as_tag().is_some())
        })
        .ok_or(FetchError::ParseError("Failed to get script tag"))?;

    // Step 3: parse that node's inner HTML as JSON.
    let payload = script.inner_html(parser);
    payload
        .parse()
        .map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))
}
// fn parse_script_json(html: &Html, varname: &str) -> Option<serde_json::Value> {
// let template_sel = Selector::parse(&format!(
// "template[data-type=\"json\"][data-varname=\"{}\"]",
// varname
// ))
// .unwrap();
// match html.select(&template_sel).next() {
// Some(value) => match value.first_element_child() {
// Some(script) => match serde_json::from_str(&script.inner_html()) {
// Ok(val) => val,
// Err(_) => None,
// },
// None => None,
// },
// None => None,
// }
// }
fn now_sec() -> u64 {
let start = SystemTime::now();
let since_the_epoch = start
@ -210,6 +355,6 @@ async fn db_writer(rx: Receiver<PrecioPoint>) {
let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
while let Ok(res) = rx.recv().await {
println!("{:#?}", res)
println!("{:?}", res)
}
}