Compare commits

...

62 commits

Author SHA1 Message Date
78878d8b7e warcificator: seguir limpiando 2024-01-08 11:57:08 -03:00
1abd98724d warcificator: limpiar 2024-01-08 11:55:59 -03:00
56a257c389 warcificator: limpiar 2024-01-08 11:53:46 -03:00
2d2912e4e9 warcificator : conseguir threads de env 2024-01-08 11:51:49 -03:00
abd430421c sqlite en contenedor 2024-01-08 10:29:55 -03:00
c56272dc30 Merge branch 'master' into wip-rust-downloader 2024-01-08 10:29:24 -03:00
448341d4e9 pushear nuevo contenedor en gh actions 2024-01-06 23:05:49 -03:00
f68ede609d actualizar lockfile 2024-01-06 23:04:18 -03:00
dbdd19a4c2 usar croner en vez de cron
uso de memoria?
2024-01-06 23:03:16 -03:00
923dd0b910 basar imagen en wolfi 2024-01-06 22:42:24 -03:00
10ea426fb1 ignorar rust target 2024-01-06 22:42:19 -03:00
adb98caa7f probar usar imagen basada en debian 2024-01-06 22:30:55 -03:00
7c967a8d74 no usar tini
parece que railway.app ya usa algun pid 1
2024-01-06 21:19:53 -03:00
43954d7af8 correr cron en sitio 2024-01-06 21:17:18 -03:00
14873c95db railway.app no permite VOLUME 2024-01-06 20:25:01 -03:00
add02ac2b0 llamarlo dockerfile para que lo entienda railway.app 2024-01-06 20:23:55 -03:00
a12476129c usar solo una imagen docker 2024-01-06 20:23:06 -03:00
f645607adf search: elegir el que mas stock tenga 2024-01-05 15:41:19 -03:00
5754214498 search: poner cosas sin stock al final 2024-01-05 15:40:54 -03:00
3e4d91f330 devcontainer: corregir nombre 2024-01-05 15:20:12 -03:00
9829d40ee9 chore: limpiar 2024-01-04 22:52:53 -03:00
26cb7b80e3 siempre conseguir ultima entry para busqueda 2024-01-04 22:50:44 -03:00
a1faa4f73c conseguir ultima imagen y nombre para pagina 2024-01-04 22:47:57 -03:00
930d1b109d sitio: corregir types 2024-01-04 22:33:36 -03:00
387036a958 cachear home por 10 min
tarda mucho hacer la query random
2024-01-04 20:44:42 -03:00
0dd725aafd ups
cuando no hay warc_version_id, todos los NULL se groupean entre si dejando solo una entry.

esto hace que funcione bien, excepto cuando hay varios con el mismo warc_record_id, en ese caso van a aparecer como entries distintas. en la práctica creo que en prod no hay warc_version_ids duplicados.
2024-01-04 20:04:50 -03:00
899133e474 actualizar drizzle-orm scraper 2024-01-04 19:58:25 -03:00
802d2c3c4d ci: chequear types antes de pushear imagen 2024-01-04 19:56:46 -03:00
92e814b13a corregir types + ci types 2024-01-04 19:56:18 -03:00
6a29ed257d ci: chequear ts sitio 2024-01-04 19:48:26 -03:00
1ce33c250e lockfile 2024-01-04 19:46:08 -03:00
845dc2dac1 sitio: colores jumbo 2024-01-04 19:46:03 -03:00
f089ff5047 arreglar types productpreview 2024-01-04 19:45:56 -03:00
16a51e41b1 actualizar drizzle-orm 2024-01-04 19:39:21 -03:00
2f14580142 actualizar drizzle-orm sitio 2024-01-04 19:38:41 -03:00
70298a601f activar jumbo en auto 2024-01-04 19:32:16 -03:00
f154053204 lockfile 2024-01-04 19:28:03 -03:00
1ce87c4fce esperar mas 2024-01-04 19:27:19 -03:00
525510a8dd Jumbo 2024-01-04 19:25:17 -03:00
e890d5f63b arreglar busqueda para queries extrañas 2024-01-04 18:47:24 -03:00
f0798e8620 link-scrapers: reutilizar codigo sitemaps 2024-01-04 18:12:55 -03:00
fa6de68f60 scraper: reordenar codigo
- borrar código viejo
- centralizar scrapers de links
2024-01-04 18:10:02 -03:00
a322bc36fc coto: corregir chequeo instock 2024-01-04 17:49:10 -03:00
da9f2c8348 cli: poder scrappear links especificos 2024-01-04 17:48:27 -03:00
e6f084b1da coto instock 2024-01-04 17:45:46 -03:00
6256817ee1 retornar progres 2024-01-04 16:55:48 -03:00
df845acc66 esperar entre pedidos fallidso 2024-01-04 16:52:11 -03:00
3c9788647b mostrar path a html debug junto al error 2024-01-04 16:42:47 -03:00
4f5994a2e1 no volver a scrapear cosas salteadas 2024-01-04 16:42:37 -03:00
087be6714c reintentar scrap 2024-01-04 16:31:00 -03:00
7e58397c8c siempre guardar html debug 2024-01-04 15:30:25 -03:00
5312861c42 gh actions sitio 2024-01-04 15:24:56 -03:00
f80c3ad4fc github actions 2024-01-04 15:20:56 -03:00
71e66cf437 no asumir que offers es solo length===1 2024-01-02 19:34:35 -03:00
db008c582f solo mostrar articulos con stock en home 2024-01-02 19:25:31 -03:00
21439a5da5 conseguir ultima entry
fixes #10
2024-01-02 19:05:50 -03:00
5c52a12fdf juntar por ean busqueda
fixes #11
2024-01-02 10:34:43 -03:00
47f566cd82 bunlock 2024-01-02 10:29:29 -03:00
97d94037e3 parsear urls xml 2024-01-02 10:28:55 -03:00
4d3793ddad chore: dockerignore 2024-01-02 00:24:04 -03:00
951ac32368 chore 2024-01-02 00:23:30 -03:00
c4b49814fb RIP WARC 2024-01-02 00:21:21 -03:00
47 changed files with 911 additions and 1015 deletions

View file

@ -1,7 +1,7 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/alpine
{
"name": "Alpine",
"name": "Debian",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:debian",
"features": {

View file

@ -4,4 +4,11 @@ data/carrefour/
downloader/
node_modules/
*/node_modules/
*/Containerfile
Containerfile
*/Containerfile
Dockerfile
*/Dockerfile
*.warc.zst
.git
scraper/debug/
*/target/

54
.github/workflows/container.yml vendored Normal file
View file

@ -0,0 +1,54 @@
# On every push to master: run the TypeScript checks and, only if they pass,
# build the container image and push it to the GitHub Container Registry.
name: check and publish container image
on:
  push:
    branches: ["master"]
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
jobs:
  # Installs dependencies and runs `bun check` in both ./sitio and ./scraper.
  check:
    name: chequear typescript
    runs-on: ubuntu-latest
    steps:
      # Same checkout major version as the publish job below.
      - uses: actions/checkout@v4
      - uses: oven-sh/setup-bun@v1
      - run: bun install
        working-directory: ./sitio
      - run: bun check
        working-directory: ./sitio
      - run: bun install
        working-directory: ./scraper
      - run: bun check
        working-directory: ./scraper
  # Builds the image from the repository root and pushes it as
  # <REGISTRY>/<IMAGE_NAME>/sitio; gated on the `check` job.
  build-and-push-sitio:
    needs: check
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Required to push the image to the registry with GITHUB_TOKEN.
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

3
.gitignore vendored
View file

@ -13,3 +13,6 @@ scraper/x.tsv
*.tmp
target/
.env.*
*/flamegraph.svg
*/perf.data*

2
.vscode/launch.json vendored
View file

@ -13,7 +13,7 @@
// https://github.com/vadimcn/codelldb/issues/884
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
},
"args": ["../data/samples/Carrefour.50.txt"],
"args": ["../data/carrefour"],
"env": {}
},
{

30
Dockerfile Normal file
View file

@ -0,0 +1,30 @@
# Base image shared by the build stage.
FROM docker.io/oven/bun:1-alpine AS base
WORKDIR /usr/src/app

# Build stage: installs the workspace dependencies, builds the site and
# bundles the scraper CLI into a single file.
FROM base AS build
ENV NODE_ENV=production
RUN apk add --no-cache nodejs
COPY . .
RUN bun install --frozen-lockfile
RUN cd sitio && \
    bun run build
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js

# Runtime image: only receives the artifacts copied from the build stage.
FROM cgr.dev/chainguard/wolfi-base
RUN apk add --no-cache nodejs npm jq bun sqlite

# Site
COPY --from=build /usr/src/app/sitio/package.json package.real.json
# Write a minimal package.json that carries only the site's "dependencies"
# (extracted with jq), then install them with npm.
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .

# Scraper
COPY --from=build /tmp/cli.build.js /bin/scraper
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle

ENV NODE_ENV=production
ENV DB_PATH=/db/db.db

EXPOSE 3000

CMD ["node", "."]

BIN
bun.lockb

Binary file not shown.

View file

@ -1,17 +0,0 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}

3
data/Jumbo.txt Normal file
View file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
size 922185

100
data/samples/Jumbo.100.txt Normal file
View file

@ -0,0 +1,100 @@
https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p
https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p
https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p
https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p
https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p
https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p
https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p
https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p
https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p
https://www.jumbo.com.ar/pizza-muzarella-e/p
https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p
https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p
https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p
https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p
https://www.jumbo.com.ar/espumante-chandon-demi-sec/p
https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p
https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p
https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p
https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p
https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p
https://www.jumbo.com.ar/campera-mujer-puffer-larga/p
https://www.jumbo.com.ar/tabla-de-quesos/p
https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p
https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p
https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p
https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p
https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p
https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p
https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p
https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p
https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p
https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p
https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p
https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p
https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p
https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p
https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p
https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p
https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p
https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p
https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p
https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p
https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p
https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p
https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p
https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p
https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p
https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p
https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p
https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p
https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p
https://www.jumbo.com.ar/oregano-100-gr/p
https://www.jumbo.com.ar/puerro-song/p
https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p
https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p
https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p
https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p
https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p
https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p
https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p
https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p
https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p
https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p
https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p
https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p
https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p
https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p
https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p
https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p
https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p
https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p
https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p
https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p
https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p
https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p
https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p
https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p
https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p
https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p
https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p
https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p
https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p
https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p
https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p
https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p
https://www.jumbo.com.ar/sabana-king-caracol-krea/p
https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p
https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p
https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p
https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p
https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p
https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p
https://www.jumbo.com.ar/budin-limon-y-amapolas/p
https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p
https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p
https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p
https://www.jumbo.com.ar/notcreamcheese-210-gr/p
https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p
https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p
https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p

View file

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
},
"devDependencies": {
"@types/bun": "^1.0.0",

View file

@ -2,15 +2,23 @@ export enum Supermercado {
Dia = "Dia",
Carrefour = "Carrefour",
Coto = "Coto",
Jumbo = "Jumbo",
}
export const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
Supermercado.Jumbo,
];
export const hosts: { [host: string]: Supermercado } = {
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
"www.carrefour.com.ar": Supermercado.Carrefour,
"www.cotodigital3.com.ar": Supermercado.Coto,
"www.jumbo.com.ar": Supermercado.Jumbo,
};
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "#d52b1e",
[Supermercado.Carrefour]: "#19549d",
[Supermercado.Coto]: "#e20025",
[Supermercado.Jumbo]: "#2dc850",
};

View file

@ -1,17 +0,0 @@
{
"name": "dia-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.0"
}
}

View file

@ -1,5 +1,6 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
export async function scrapCarrefourProducts() {
await scrapBySitemap();
@ -25,17 +26,7 @@ async function scrapBySitemap() {
async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
saveUrls(getUrlsFromSitemap(xml));
},
{ concurrency: 3 }
);

14
link-scrapers/common.ts Normal file
View file

@ -0,0 +1,14 @@
import { decodeXML } from "entities";
/**
 * Extracts the URLs listed in the `<loc>` elements of a sitemap.
 *
 * @param xml - Raw sitemap XML document.
 * @returns The distinct `<loc>` values, XML-entity-decoded, in order of first
 *   appearance. Empty / whitespace-only `<loc>` elements are ignored.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // HTMLRewriter may deliver a single text node as several chunks. Buffer the
  // pieces and only emit once the node is complete, so a URL (or an entity
  // such as `&amp;`) split across chunks is never stored in fragments.
  let pending = "";
  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        urls.add(decodeXML(txt));
      },
    })
    .transform(new Response(xml));
  return Array.from(urls);
}

View file

@ -1,4 +1,3 @@
import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom";
import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
@ -28,12 +27,13 @@ function getPage(url: string) {
return async () => {
let html;
try {
html = await getHtml(url);
const res = await fetch(url);
html = await res.text();
} catch (error) {
await getPage(url)();
return;
}
const { document } = parseHTML(html.toString("utf-8"));
const { document } = parseHTML(html);
const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),

View file

@ -1,7 +1,7 @@
import pMap from "p-map";
import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen",
@ -81,21 +81,15 @@ async function scrapBySitemap() {
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
];
await pMap(sitemaps, async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
await pMap(
sitemaps,
async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
saveUrls(getUrlsFromSitemap(xml));
},
{ concurrency: 3 }
);
}
async function scrapBySite() {
@ -110,8 +104,9 @@ async function scrapBySite() {
await pMap(
links,
async (url) => {
const html = await getHtml(url);
const { document } = parseHTML(html.toString("utf-8"));
const res = await fetch(url);
const html = await res.text();
const { document } = parseHTML(html);
const hrefs = Array.from(
document.querySelectorAll<HTMLAnchorElement>(

38
link-scrapers/jumbo.ts Normal file
View file

@ -0,0 +1,38 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
/**
 * Entry point for collecting Jumbo product links.
 * Currently the only source is the product sitemaps (see scrapBySitemap).
 */
export async function scrapJumboProducts() {
await scrapBySitemap();
}
/**
 * Downloads every Jumbo product sitemap and stores the URLs found in each.
 * At most 3 sitemaps are processed concurrently.
 */
async function scrapBySitemap() {
  // taken from https://www.jumbo.com.ar/sitemap.xml
  const productSitemaps = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];

  // Fetches one sitemap and hands its <loc> URLs to saveUrls.
  const fetchAndStore = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    saveUrls(getUrlsFromSitemap(body));
  };

  await pMap(productSitemaps, fetchAndStore, { concurrency: 3 });
}

View file

@ -1,5 +1,5 @@
{
"name": "coto-link-scraper",
"name": "link-scrapers",
"type": "module",
"version": "1.0.0",
"description": "",
@ -11,6 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"entities": "^4.5.0",
"linkedom": "^0.16.5",
"p-queue": "^8.0.1"
}

View file

@ -2,9 +2,7 @@
"name": "preciazo",
"private": true,
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"carrefour-link-scraper",
"link-scrapers",
"scraper",
"sitio",
"db-datos"

View file

@ -4,33 +4,23 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
## componentes (en orden de proceso)
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
- [warcificator](./warcificator/) descarga las paginas de productos y genera un archivo [WARC](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) con ellas
- el [scraper](./scraper/) procesa estos WARCs, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
## setup
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js.
aparte, se necesita zstd, que se usa para comprimir los WARCs eficientemente. seguro está disponible en las repos de tu distro favorita :)
empezá descargando un WARC con 50 páginas de sample, y recomprimilo con zstd:
```
wget --no-verbose --tries=3 --delete-after --input-file ./data/samples/Dia.txt --warc-file=dia-sample
gzip -dc dia-sample.warc.gz | zstd --long -15 --no-sparse -o dia-sample.warc.zst
```
después, scrapealo a una BD:
después, scrapeá un sample de productos de Carrefour a una BD:
```
cd scraper/
bun install
bun cli.ts scrap ../dia-sample.warc.zst
bun cli.ts scrap ../data/samples/Carrefour.50.txt
```
ahora miralo en el sitio:

View file

@ -1,42 +0,0 @@
FROM docker.io/oven/bun:1-alpine AS base
WORKDIR /usr/src/app
FROM base AS builder
ENV NODE_ENV=production
COPY . .
RUN bun install --frozen-lockfile \
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
&& rm -rf node_modules/
# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd
FROM docker.io/rust:1.74 AS warcificator-builder
WORKDIR /usr/src/
RUN rustup target add x86_64-unknown-linux-musl
RUN apt-get update && apt-get install -y musl-tools musl-dev
RUN USER=root cargo new warcificator
WORKDIR /usr/src/warcificator
COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./
RUN cargo build --release
COPY ./warcificator/src ./src
RUN cargo install --target x86_64-unknown-linux-musl --path .
FROM base
RUN apk add --no-cache wget zstd tini
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper
COPY --from=builder /tmp/cli.build.js /bin/scraper
COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=builder /usr/src/app/data /listas
WORKDIR /app
VOLUME /db
ENV NODE_ENV=production
ENV DB_PATH=/db/db.db
ENV LISTS_DIR=/listas/
CMD ["tini", "/bin/busybox", "crond", "-f", "-l2"]
# CMD ["bun", "/bin/scraper"]

View file

@ -1,29 +1,20 @@
import { mkdtemp, access, writeFile } from "node:fs/promises";
import { mkdtemp, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join, resolve } from "node:path";
import { spawn } from "node:child_process";
import { Supermercado, hosts } from "db-datos/supermercado.js";
import { join } from "node:path";
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
import PQueue from "p-queue";
import { format, formatDuration, intervalToDuration } from "date-fns";
import { parseWarc } from "./scrap.js";
import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun";
import { formatDuration, intervalToDuration } from "date-fns";
import { downloadList } from "./scrap.js";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
Supermercado.Dia,
];
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 1 });
const scrapQueue = new PQueue({ concurrency: 4 });
export async function auto() {
const a = new Auto();
@ -31,35 +22,9 @@ export async function auto() {
}
class Auto {
s3Config?: { s3: S3Client; bucketName: string };
telegramConfig?: { token: string; chatId: string };
constructor() {
if (
!process.env.S3_ACCESS_KEY_ID ||
!process.env.S3_SECRET_ACCESS_KEY ||
!process.env.S3_BUCKET_NAME
) {
if (process.env.NODE_ENV === "development") {
console.warn("faltan creds de s3, no voy a subir a s3");
} else {
throw new Error("faltan creds de s3");
}
} else {
this.s3Config = {
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
s3: new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: process.env.S3_ACCESS_KEY_ID,
secretAccessKey: process.env.S3_SECRET_ACCESS_KEY,
},
}),
bucketName: process.env.S3_BUCKET_NAME,
};
}
if (!process.env.TELEGRAM_BOT_TOKEN)
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
else if (!process.env.TELEGRAM_BOT_CHAT_ID)
@ -89,6 +54,9 @@ class Auto {
case "Carrefour":
await scrapCarrefourProducts();
break;
case "Jumbo":
await scrapJumboProducts();
break;
}
this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
@ -107,93 +75,29 @@ class Auto {
const urls = results.map((r) => r.url);
await writeFile(listPath, urls.join("\n") + "\n");
const date = new Date();
const zstdWarcName = `${supermercado}-${format(
date,
"yyyy-MM-dd-HH:mm"
)}.warc.zst`;
const zstdWarcPath = join(ctxPath, zstdWarcName);
const subproc = Bun.spawn({
cmd: ["warcificator", listPath, zstdWarcPath],
stderr: "ignore",
stdout: "ignore",
cwd: ctxPath,
});
const t0 = performance.now();
await subproc.exited;
this.inform(
`[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
);
if (!(await fileExists(zstdWarcPath))) {
const err = this.report(`no encontré el ${zstdWarcPath}`);
throw err;
}
this.scrapAndInform({ zstdWarcPath, zstdWarcName });
try {
await this.uploadToBucket({
fileName: zstdWarcName,
file: Bun.file(zstdWarcPath),
});
} catch (error) {
this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
console.error(error);
}
this.scrapAndInform({ listPath });
// TODO: borrar archivos temporales
}
async scrapAndInform({
zstdWarcPath,
zstdWarcName,
}: {
zstdWarcPath: string;
zstdWarcName: string;
}) {
async scrapAndInform({ listPath }: { listPath: string }) {
const res = await scrapQueue.add(async () => {
const t0 = performance.now();
const progress = await parseWarc(zstdWarcPath);
const progress = await downloadList(listPath);
return { took: performance.now() - t0, progress };
});
if (res) {
const { took, progress } = res;
this.inform(
`Procesado ${zstdWarcName} (${progress.done} ok, ${
progress.errors.length
} errores) (tardó ${formatMs(took)})`
`Procesado ${listPath} (${progress.done} ok, ${
progress.skipped
} skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})`
);
} else {
this.inform(`Algo falló en ${zstdWarcName}`);
this.inform(`Algo falló en ${listPath}`);
}
}
async uploadToBucket({
fileName,
file,
}: {
fileName: string;
file: BunFile;
}) {
if (!this.s3Config) {
this.inform(
`[s3] Se intentó subir ${fileName} pero no tenemos creds de S3`
);
return;
}
const parallelUploads3 = new Upload({
client: this.s3Config.s3,
params: {
Bucket: this.s3Config.bucketName,
Key: fileName,
Body: file,
},
});
await parallelUploads3.done();
}
inform(msg: string) {
this.sendTelegramMsg(msg);
console.info(msg);
@ -216,16 +120,6 @@ class Auto {
}
}
// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists
async function fileExists(path: string) {
try {
access(path);
return true;
} catch {
return false;
}
}
function formatMs(ms: number) {
return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) }));
}

View file

@ -1,8 +1,9 @@
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { auto } from "./auto.js";
import { parseWarc } from "./scrap.js";
import { downloadList, getProduct } from "./scrap.js";
if (process.argv[2] === "auto") {
await auto();
@ -12,17 +13,24 @@ if (process.argv[2] === "auto") {
await scrapDiaProducts();
} else if (process.argv[2] === "scrap-coto-links") {
await scrapCotoProducts();
} else if (process.argv[2] === "scrap-jumbo-links") {
await scrapJumboProducts();
} else if (process.argv[2] === "scrap-link") {
const url = new URL(process.argv[3]);
const res = await fetch(url);
const text = await res.text();
console.info(await getProduct(url, text));
} else if (process.argv[2] === "scrap") {
const warcPaths = process.argv.slice(3);
if (warcPaths.length > 0) {
for (const path of warcPaths) {
const res = await parseWarc(path);
const urlLists = process.argv.slice(3);
if (urlLists.length > 0) {
for (const path of urlLists) {
const res = await downloadList(path);
console.info("=======================================");
console.info(path, res);
console.info("=======================================");
}
} else {
console.error("Especificá WARCs para scrapear.");
console.error("Especificá listas de urls para scrapear.");
process.exit(1);
}
} else {

View file

@ -1,13 +0,0 @@
export async function getHtml(url: string) {
const res = await fetch(url);
return readableToBuffer(res.body!);
}
async function readableToBuffer(source: AsyncIterable<any>) {
// https://stackoverflow.com/a/72891118
const buffers = [];
for await (const data of source) {
buffers.push(data);
}
return Buffer.concat(buffers);
}

View file

@ -5,8 +5,7 @@
"description": "",
"main": "index.js",
"scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
"check": "tsc"
},
"keywords": [],
"author": "",
@ -16,11 +15,11 @@
"@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6",
"db-datos": "workspace:^",
"drizzle-orm": "=0.29.1",
"drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5",
"nanoid": "^5.0.4",
"p-map": "^7.0.1",
"p-queue": "^8.0.1",
"warcio": "^2.2.1",
"zod": "^3.22.4"
},
"devDependencies": {

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
function parseScriptJson<T>(dom: Window, varname: string): T {
const script = dom.window.document.querySelector<HTMLTemplateElement>(

View file

@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] {
const scripts = dom.window.document.querySelectorAll(
'script[type="application/ld+json"]'
);
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
return Array.from(scripts, (script) => JSON.parse(script.innerHTML));
}
function findJsonLd(dom: Window, type: string): object | undefined {
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
@ -31,8 +31,9 @@ const zProductLd = z.object({
"@type": z.literal("Product"),
name: z.string(),
image: z.string(),
sku: z.string().optional(),
offers: z.object({
offers: z.tuple([
offers: z.array(
z.object({
"@type": z.literal("Offer"),
price: z.number(),
@ -41,8 +42,8 @@ const zProductLd = z.object({
"http://schema.org/OutOfStock",
"http://schema.org/InStock",
]),
}),
]),
})
),
}),
});
type ProductLd = z.infer<typeof zProductLd>;

View file

@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) {
}
function getPriceFromText({ document }: Window) {
const el = document.querySelector(".atg_store_newPrice");
if (!el?.textContent) throw new Error("no encuentro el precio");
if (!el?.textContent) return null;
const nStr = el.textContent
.trim()
.replace("$", "")
@ -27,12 +27,16 @@ function getPriceFromText({ document }: Window) {
.replace(",", ".");
return parseFloat(nStr) * 100;
}
/**
 * Reports whether the product page lacks the "not available" marker.
 *
 * @returns `true` when no `.product_not_available` element exists in the document.
 */
function getInStock(dom: Window) {
  const notAvailableMarker = dom.document.querySelector(
    ".product_not_available"
  );
  return !notAvailableMarker;
}
export function getCotoProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom);
const inStock = getInStock(dom);
const name = dom.document
.querySelector("h1.product_page")
@ -40,5 +44,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const imageUrl =
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
return { name, imageUrl, ean, precioCentavos };
return { name, imageUrl, ean, precioCentavos, inStock };
}

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);

54
scraper/parsers/jumbo.ts Normal file
View file

@ -0,0 +1,54 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
import { z } from "zod";
// Shape expected from the Jumbo product search response (parsed in
// getEanFromSearch): an array holding a single product object whose `items`
// list carries one string `ean` per item. Being a one-element tuple, a
// response with zero or several products presumably fails validation.
const zJumboSearch = z.tuple([
z.object({
items: z.array(
z.object({
ean: z.string(),
})
),
}),
]);
/**
 * Resolves the EAN of a Jumbo product by querying the store's catalog search
 * endpoint with the `skuId:<sku>` filter.
 *
 * @param sku Retailer SKU id taken from the product page.
 * @returns The EAN shared by every item of the matching product.
 * @throws Error when the request is answered with a non-2xx status, when the
 *   product has no items, or when its items disagree on the EAN. A response
 *   that does not match `zJumboSearch` makes zod throw its own error.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(
      `Jumbo respondió status=${res.status} al buscar el SKU ${sku}`
    );
  }

  const [product] = zJumboSearch.parse(await res.json());
  const first = product.items[0];
  // The schema accepts an empty `items` array; fail with a clear message
  // instead of a TypeError on `undefined.ean`.
  if (!first) {
    throw new Error(
      `Inesperado: la búsqueda de Jumbo no devolvió items para el SKU ${sku}`
    );
  }

  const ean = first.ean;
  if (!product.items.every((item) => item.ean === ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return ean;
}
/**
 * Builds a `Precioish` from a Jumbo product page.
 *
 * Price and stock come from the `priceFromMeta` / `stockFromMeta` helpers,
 * name, image and SKU from the page's Product JSON-LD, and the EAN from an
 * extra request made by `getEanFromSearch` using that SKU.
 *
 * @param html Product page markup.
 * @throws Error when the JSON-LD has no SKU; errors from the helpers propagate.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const dom = parseHTML(html);

  // Helpers run in this order: meta price, meta stock, then JSON-LD.
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const { name, image: imageUrl, sku } = getProductJsonLd(dom);

  if (!sku) {
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  }

  return {
    name,
    imageUrl,
    ean: await getEanFromSearch(sku),
    precioCentavos,
    inStock,
  };
}

View file

@ -1,112 +1,127 @@
/// <reference lib="dom" />
import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
import { writeFile, mkdir } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import { and, eq, sql } from "drizzle-orm";
import { db } from "db-datos/db.js";
import pMap from "p-map";
import { getJumboProduct } from "./parsers/jumbo.js";
const DEBUG = false;
const DEBUG = true;
const PARSER_VERSION = 4;
// Prepared query returning the id of at most one `precios` row whose
// warcRecordId equals the `warcRecordId` placeholder and whose parserVersion
// equals the current PARSER_VERSION. parseWarc uses it to skip records that
// were already stored by this parser version.
const getPrevPrecio = db
.select({ id: schema.precios.id })
.from(schema.precios)
.where(
and(
eq(schema.precios.warcRecordId, sql.placeholder("warcRecordId")),
eq(schema.precios.parserVersion, PARSER_VERSION)
)
)
.limit(1)
.prepare();
// Row shape accepted when inserting into the `precios` table.
export type Precio = typeof schema.precios.$inferInsert;
// A Precio without the bookkeeping columns (fetchedAt, url, id, warcRecordId,
// parserVersion). This is what the per-site parsers return.
export type Precioish = Omit<
Precio,
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
>;
export async function parseWarc(path: string) {
// const warc = createReadStream(path);
let progress: {
done: number;
errors: { error: any; warcRecordId: string; path: string }[];
} = { done: 0, errors: [] };
export async function downloadList(path: string) {
let list = (await Bun.file(path).text())
.split("\n")
.filter((s) => s.length > 0);
const proc = Bun.spawn(["zstdcat", "-d", path], {});
const warc = proc.stdout;
// TODO: tirar error si falla zstd
const parser = new WARCParser(warc);
for await (const record of parser) {
if (record.warcType === "response") {
if (!record.warcTargetURI) continue;
const warcRecordId = record.warcHeader("WARC-Record-ID");
if (!warcRecordId) throw new Error("No tiene WARC-Record-ID");
if (getPrevPrecio.get({ warcRecordId })) {
console.debug(`skipped ${warcRecordId}`);
continue;
}
if (record.httpHeaders?.statusCode !== 200) {
console.debug(
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
);
continue;
}
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
const html = await record.contentText();
const url = new URL(record.warcTargetURI);
try {
let ish: Precioish | undefined = undefined;
if (url.hostname === "www.carrefour.com.ar")
ish = getCarrefourProduct(html);
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
ish = getDiaProduct(html);
else if (url.hostname === "www.cotodigital3.com.ar")
ish = getCotoProduct(html);
else throw new Error(`Unknown host ${url.hostname}`);
const p: Precio = {
...ish,
fetchedAt: new Date(record.warcDate!),
url: record.warcTargetURI,
warcRecordId,
parserVersion: PARSER_VERSION,
};
await db.insert(schema.precios).values(p);
progress.done++;
} catch (error) {
console.error({ path, warcRecordId, error });
progress.errors.push({
path,
warcRecordId,
error,
});
if (DEBUG) {
const urlHash = createHash("md5")
.update(record.warcTargetURI!)
.digest("hex");
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);
console.error(`wrote html to ${output}`);
const results = await pMap(
list,
async (urlS) => {
let res: ScrapResult = { type: "skipped" };
for (let attempts = 0; attempts < 6; attempts++) {
if (attempts !== 0) await wait(1500);
res = await scrap(urlS);
if (res.type === "done" || res.type === "skipped") {
break;
}
}
if (res.type === "error") console.error(res);
return res;
},
{ concurrency: 32 }
);
let progress: {
done: number;
skipped: number;
errors: { error: any; url: string; debugPath: string }[];
} = { done: 0, skipped: 0, errors: [] };
for (const result of results) {
switch (result.type) {
case "done":
progress.done++;
break;
case "error":
progress.errors.push(result);
break;
case "skipped":
progress.skipped++;
break;
}
}
if ((await proc.exited) !== 0) {
throw new Error("zstd tiró un error");
}
return progress;
}
/**
 * Picks the site-specific parser from the URL's hostname and runs it on `html`.
 *
 * @param url Address the page was downloaded from; only `hostname` is read.
 * @param html Page markup handed to the parser.
 * @returns The product data extracted by the matching parser.
 * @throws Error `Unknown host <hostname>` when no parser handles the host.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
}
// Outcome of scraping a single URL:
// - "skipped": the URL was not processed (see `scrap` for the exact cases);
// - "done": the price row was inserted;
// - "error": something threw while handling the page. Carries the URL, the
//   thrown value and the path used for the HTML dump written when DEBUG is on.
type ScrapResult =
| { type: "skipped" }
| { type: "done" }
| { type: "error"; url: string; error: any; debugPath: string };
/**
 * Downloads one product page, parses it and stores the resulting price row.
 *
 * @param urlS Product page URL exactly as read from the list file.
 * @returns
 *   - `skipped` when `urlS` is not a valid URL or the server answers with a
 *     non-2xx status;
 *   - `done` when the row was inserted;
 *   - `error` when the download, the parsing or the insert throws. `debugPath`
 *     names the HTML dump, which is only written when DEBUG is on and the
 *     page was actually downloaded.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url: URL;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  const urlHash = createHash("md5").update(urlS).digest("hex");
  const debugPath = join("debug", `${urlHash}.html`);

  let html: string;
  try {
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
  } catch (error) {
    // A network failure must surface as an "error" result: letting it escape
    // would reject the whole pMap run in downloadList and bypass its retry
    // loop. There is no HTML to dump in this case.
    return { type: "error", url: urlS, error, debugPath };
  }

  try {
    const ish = await getProduct(url, html);
    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };
    await db.insert(schema.precios).values(p);
    return { type: "done" };
  } catch (error) {
    if (DEBUG) {
      // Keep the offending page around so the parser failure can be reproduced.
      await mkdir("debug", { recursive: true });
      await writeFile(debugPath, html);
    }
    return { type: "error", url: urlS, error, debugPath };
  }
}
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 */
function wait(ms: number) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

View file

@ -1,3 +1,4 @@
{
"extends": "../tsconfig.json"
"extends": "../tsconfig.json",
"exclude": ["../sitio"]
}

View file

@ -1,157 +0,0 @@
const crlf = "\r\n";
const crlfB = Buffer.from(crlf, "utf-8");
const crlfcrlf = crlf + crlf;
const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8");
const warc10B = Buffer.from("WARC/1.0", "utf-8");
const emptyBuffer = Buffer.from("", "utf-8");
/**
 * Streams the records of a zstd-compressed WARC file.
 *
 * The file is decompressed by a `zstd` child process and its stdout is
 * buffered. Each yielded value holds the WARC header fields of one record and
 * the HTTP body that follows the record's embedded HTTP headers, de-chunked
 * when those headers declare `Transfer-Encoding: chunked`.
 *
 * @param path Path handed to `zstd -do /dev/stdout`.
 * @throws Error when a record does not start with the `WARC/1.0` line.
 *
 * NOTE(review): a record is only parsed while another `WARC/1.0` marker
 * follows it in the buffer, so it looks like the final record of the file is
 * never yielded, and a body containing that marker would confuse the check.
 * Confirm before relying on completeness.
 */
export async function* parseWARC(path: string) {
// zstd errors are not observed here: stderr is discarded and the exit code is
// never checked.
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
stderr: "ignore",
}).stdout;
// const warc = Bun.stdin.stream(1024 * 1024 * 128);
// let buffer: Uint8Array[] = [];
// const transform = new TransformStream<Uint8Array, Buffer>({
// transform(chunk, controller) {
// buffer.push(chunk);
// if (
// buffer.reduce((prev, curr) => prev + curr.length, 0) >
// 1024 * 1024 * 64
// ) {
// controller.enqueue(Buffer.concat(buffer));
// buffer = [];
// }
// },
// flush(controller) {
// controller.enqueue(Buffer.concat(buffer));
// },
// });
// warc.pipeTo(transform.writable);
const reader = warc.getReader();
// const reader = transform.readable.getReader();
// const warc = process.stdin;
// Pending, not yet parsed bytes, kept as a list to avoid concatenating on
// every read.
let arrays: Buffer[] = [];
let done = false;
while (!done) {
const r = await reader.readMany();
if (r.done) {
done = true;
} else {
arrays = arrays.concat(r.value.map((x) => Buffer.from(x)));
// Keep reading until at least 10 MiB are pending before parsing.
if (
arrays.reduce((prev, curr) => prev + curr.length, 0) <
1024 * 1024 * 10
)
continue;
}
let buf: Buffer;
// Each pass flattens the pending bytes into `buf` and continues only if a
// second "WARC/1.0" marker exists after the one expected at the start.
while (
((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
buf.subarray(warc10B.length).includes(warc10B))
) {
// The WARC header block ends at the first blank line.
const until = buf.indexOf(crlfcrlfB);
const header = buf.subarray(0, until);
const lines = splitBuffer(header, crlfB);
let i = 0;
// Returns the next header line, or an empty buffer once they run out.
const nextLine = () => {
const line = lines[i];
i++;
return line ? line : emptyBuffer;
};
let line: Buffer;
if (!(line = nextLine()).equals(warc10B)) {
throw new Error(`No WARC 1.0 header in '${line}'`);
}
let field;
let fields = new Map<string, string>();
// Collect "Name: value" fields until the empty line returned past the end.
while (
((line = nextLine()),
(field = parseField(line.toString("utf8"))),
line.length !== 0)
) {
fields.set(field[0], field[1]);
}
// The record payload (the raw HTTP message) is Content-Length bytes long.
const length = parseInt(fields.get("Content-Length")!);
const rawHttp = buf.subarray(
until + crlfcrlfB.length,
until + crlfcrlfB.length + length
);
// HTTP headers: everything between the first line of the payload and the
// blank line that ends the header section.
const rawHttpHeaders = rawHttp
.subarray(
rawHttp.indexOf(crlfB) + crlfB.length,
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
)
.toString();
let httpHeaders = new Map<string, string>();
rawHttpHeaders.split(crlf).forEach((line) => {
if (!line.length) return;
const [key, val] = line.split(": ");
httpHeaders.set(key, val);
});
// Body: whatever follows the HTTP header section.
let content = rawHttp.subarray(
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
);
// Header names are matched case-sensitively here.
if (httpHeaders.get("Transfer-Encoding") === "chunked") {
content = dechunk(content);
}
// console.debug(fields.get("WARC-Date"), content.length);
yield {
fields,
content,
};
// Drop the consumed record plus the blank-line separator that follows it.
arrays = [
buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
];
if (!arrays[0].length) break;
}
}
}
/**
 * Splits `buffer` on every occurrence of the byte sequence `val`.
 *
 * @param buffer Bytes to split.
 * @param val Non-empty separator.
 * @returns The pieces between separators, as views over `buffer`; always at
 *   least one element, and a trailing separator produces a final empty piece.
 */
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
  const pieces: Buffer[] = [];
  let start = 0;
  for (;;) {
    const hit = buffer.indexOf(val, start);
    if (hit === -1) break;
    pieces.push(buffer.subarray(start, hit));
    start = hit + val.length;
  }
  pieces.push(buffer.subarray(start));
  return pieces;
}
/**
 * Splits a `Name: value` header line into its name and value.
 *
 * Only the first ": " separates the two, so values that contain ": "
 * themselves are preserved in full.
 *
 * @param line One header line without its line terminator.
 * @returns `[name, value]`; a line without a separator yields `[line, ""]`.
 */
function parseField(line: string): [string, string] {
  const separator = ": ";
  const at = line.indexOf(separator);
  if (at === -1) return [line, ""];
  return [line.slice(0, at), line.slice(at + separator.length)];
}
/**
 * Decodes an HTTP body sent with `Transfer-Encoding: chunked`.
 *
 * Decoding stops at the terminal zero-size chunk (anything after it, such as
 * trailer fields, is ignored) and also at the first truncated or unparsable
 * chunk header, so malformed input returns what was decoded so far instead of
 * looping forever.
 *
 * @param content Raw chunked body.
 * @returns The concatenated chunk payloads.
 */
function dechunk(content: Buffer): Buffer {
  const chunks: Buffer[] = [];
  let rest = content;
  while (true) {
    const lineEnd = rest.indexOf(crlfB);
    // No CRLF left: the chunk header is missing or truncated.
    if (lineEnd === -1) break;
    // parseInt stops at the first non-hex character, which also skips any
    // chunk extension written after the size.
    const len = parseInt(rest.subarray(0, lineEnd).toString(), 16);
    if (Number.isNaN(len) || len <= 0) break;
    const dataStart = lineEnd + crlfB.length;
    chunks.push(rest.subarray(dataStart, dataStart + len));
    // Skip the payload and the CRLF that closes the chunk.
    rest = rest.subarray(dataStart + len + crlfB.length);
  }
  return Buffer.concat(chunks);
}

View file

@ -1,31 +0,0 @@
# Build stage: install dependencies and build the site with bun (nodejs is
# added to the image as well).
FROM docker.io/oven/bun:1-alpine as build
RUN apk add --no-cache nodejs
WORKDIR /usr/src/app
COPY . .
WORKDIR /usr/src/app/sitio
RUN bun install && \
bun run build
# FROM docker.io/oven/bun:1-alpine as deps
# WORKDIR /usr/src/app/sitio
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos
# Runtime stage: plain Alpine running the built site on Node.
FROM docker.io/alpine:3.19
RUN apk add --no-cache tini nodejs npm jq
WORKDIR /app
# Generate a minimal package.json holding only the real one's `dependencies`
# (extracted with jq) and install from it.
COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
# The workspace package db-datos is copied in by hand next to the installed
# dependencies, followed by the build output.
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto
ENV HOST_HEADER=x-forwarded-host
# The database file lives in the /db volume.
VOLUME /db
ENV DB_PATH=/db/db.db
EXPOSE 3000
# tini is the container's main process and runs node under it.
CMD ["tini", "node", "."]

View file

@ -38,7 +38,8 @@
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4",
"croner": "^8.0.0",
"dayjs": "^1.11.10",
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
}
}

12
sitio/src/hooks.server.ts Normal file
View file

@ -0,0 +1,12 @@
import { spawn } from "child_process";
import Cron from "croner";
// Schedule the scraper only when the server runs with NODE_ENV=production.
if (process.env.NODE_ENV === "production") {
  // Cron pattern "15 3 * * *": minute 15 of hour 3, every day.
  const job = Cron("15 3 * * *", function () {
    runScraper();
  });
}
/**
 * Starts the scraper (`bun /bin/scraper auto`) as a child process that shares
 * this process's stdio. Fire-and-forget: the child is not awaited.
 */
function runScraper() {
  const child = spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
  // A failed spawn (for example `bun` missing from PATH) is reported through
  // the 'error' event; with no listener that event is thrown as an uncaught
  // exception and would take the whole server down.
  child.on("error", (error) => {
    console.error("scraper: could not start child process", error);
  });
  child.on("exit", (code, signal) => {
    if (code !== 0) {
      console.error(`scraper: exited with code=${code} signal=${signal}`);
    }
  });
}

View file

@ -1,8 +1,10 @@
<script lang="ts">
export let product: { ean: string; name: string; imageUrl: string };
export let product: { ean: string; name: string; imageUrl?: string | null };
</script>
<a href={`/ean/${product.ean}`} class="flex">
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
{#if product.imageUrl}
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
{/if}
<p class="text-xl">{product.name}</p>
</a>

View file

@ -1,9 +1,22 @@
import type { PageServerLoad } from "./$types";
import type { PageData, PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => {
let cache: null | { key: Date; data: { precios: Precios } } = null;
type Precios = {
ean: string;
name: string | null;
imageUrl: string | null;
}[];
export const load: PageServerLoad = async ({
params,
}): Promise<{ precios: Precios }> => {
if (cache && +new Date() < +cache.key + 1000 * 60 * 10) {
return cache.data;
}
const q = db
.select({
ean: precios.ean,
@ -12,9 +25,11 @@ export const load: PageServerLoad = async ({ params }) => {
})
.from(precios)
.groupBy(precios.ean)
.having(sql`max(length(name))`)
.having(sql`max(length(name)) and max(parser_version) and in_stock`)
.orderBy(sql`random()`)
.limit(150);
const res = await q;
return { precios: res };
const data = { precios: res };
cache = { key: new Date(), data };
return data;
};

View file

@ -3,6 +3,10 @@
import type { PageData } from "./$types";
export let data: PageData;
$: precios = data.precios.filter(
(d): d is { ean: string; name: string; imageUrl: string | null } =>
!!d.name,
);
</script>
<h1 class="text-xl">WIP</h1>
@ -32,7 +36,7 @@
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each data.precios as product}
{#each precios as product}
<li>
<ProductPreview {product} />
</li>

View file

@ -9,13 +9,11 @@ export const load: PageServerLoad = async ({ params }) => {
.select()
.from(precios)
.where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion))
.orderBy(precios.fetchedAt);
const res = await q;
if (res.length === 0) return error(404, "Not Found");
const meta = res.find((p) => p.name);
const meta = res.findLast((p) => p.name);
return { precios: res, meta };
};

View file

@ -17,6 +17,7 @@
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
};
</script>

View file

@ -1,18 +1,19 @@
import { error } from "@sveltejs/kit";
import { eq, max, sql } from "drizzle-orm";
import { sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
import { db } from "$lib/server/db";
export const load: PageServerLoad = async ({ url }) => {
const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
if (query) {
results = db.all(
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean
where f.name match ${query};`,
);
where f.name match ${`"${query}"`}
group by p.ean
having max(p.fetched_at) and max(p.in_stock)
order by p.in_stock desc;`;
results = db.all(sqlQuery);
}
return { query, results };

View file

@ -15,5 +15,6 @@
"noEmit": true,
"forceConsistentCasingInFileNames": true
},
"include": ["**/*.ts", "**/*.js"]
"include": ["**/*.ts", "**/*.js"],
"exclude": ["sitio/build"]
}

534
warcificator/Cargo.lock generated
View file

@ -24,7 +24,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01"
dependencies = [
"cfg-if",
"getrandom",
"once_cell",
"version_check",
"zerocopy",
@ -144,12 +143,6 @@ version = "3.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.5.0"
@ -205,6 +198,16 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.18"
@ -215,60 +218,14 @@ dependencies = [
]
[[package]]
name = "cssparser"
version = "0.31.2"
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.2",
"smallvec",
"powerfmt",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "encoding_rs"
version = "0.8.33"
@ -342,16 +299,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures-channel"
version = "0.3.30"
@ -391,24 +338,6 @@ dependencies = [
"pin-utils",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.11"
@ -470,20 +399,6 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "0.2.11"
@ -597,6 +512,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.151"
@ -629,26 +550,6 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "memchr"
version = "2.7.1"
@ -682,10 +583,14 @@ dependencies = [
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num_cpus"
@ -712,6 +617,12 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking"
version = "2.2.0"
@ -747,86 +658,6 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared 0.11.2",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.13"
@ -846,16 +677,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "proc-macro2"
@ -875,36 +700,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
@ -1033,22 +828,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1"
dependencies = [
"ahash",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"tendril",
]
[[package]]
name = "sct"
version = "0.7.1"
@ -1059,25 +838,6 @@ dependencies = [
"untrusted",
]
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags 2.4.1",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.193"
@ -1095,7 +855,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]
[[package]]
@ -1122,12 +882,12 @@ dependencies = [
]
[[package]]
name = "servo_arc"
version = "0.3.0"
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"stable_deref_trait",
"lazy_static",
]
[[package]]
@ -1139,12 +899,6 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slab"
version = "0.4.9"
@ -1176,49 +930,6 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.43"
@ -1252,14 +963,62 @@ dependencies = [
]
[[package]]
name = "tendril"
version = "0.4.3"
name = "thiserror"
version = "1.0.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2"
dependencies = [
"futf",
"mac",
"utf-8",
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "time"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
dependencies = [
"deranged",
"itoa",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f"
dependencies = [
"time-core",
]
[[package]]
@ -1277,6 +1036,11 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tl"
version = "0.7.7"
source = "git+https://github.com/evertedsphere/tl?branch=patch-1#56711166588fa6c7729a08e5740dca2526436316"
[[package]]
name = "tokio"
version = "1.35.1"
@ -1304,7 +1068,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]
[[package]]
@ -1343,10 +1107,35 @@ version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
"log",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-appender"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
dependencies = [
"crossbeam-channel",
"thiserror",
"time",
"tracing-subscriber",
]
[[package]]
name = "tracing-attributes"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.32"
@ -1354,6 +1143,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
dependencies = [
"log",
"once_cell",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
dependencies = [
"nu-ansi-term",
"sharded-slab",
"smallvec",
"thread_local",
"tracing-core",
"tracing-log",
]
[[package]]
@ -1383,12 +1198,6 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -1407,10 +1216,10 @@ dependencies = [
]
[[package]]
name = "utf-8"
version = "0.7.6"
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
@ -1440,10 +1249,13 @@ dependencies = [
"async-channel",
"reqwest",
"rusqlite",
"scraper",
"serde",
"serde_json",
"tl",
"tokio",
"tracing",
"tracing-appender",
"tracing-subscriber",
]
[[package]]
@ -1473,7 +1285,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
"wasm-bindgen-shared",
]
@ -1507,7 +1319,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -1534,6 +1346,28 @@ version = "0.25.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.48.0"
@ -1627,5 +1461,5 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
"syn",
]

View file

@ -7,13 +7,18 @@ edition = "2021"
[dependencies]
async-channel = "2.1.1"
# lol_html = "1.2.0"
reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
"gzip",
"brotli",
] }
rusqlite = "0.30.0"
scraper = "0.18.1"
# scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] }
tokio = { version = "1.35.1", features = ["full"] }
tracing = { version = "0.1", features = ["log"] }
tracing-appender = "0.2.3"
tracing-subscriber = "0.3.18"

View file

@ -1,11 +1,12 @@
use async_channel::{Receiver, Sender};
use rusqlite::Connection;
use scraper::{Element, Html, Selector};
use std::{
env::args,
borrow::Cow,
env::{self, args},
fs,
time::{SystemTime, UNIX_EPOCH},
};
use tl::VDom;
use tokio::io::{stderr, AsyncWriteExt};
#[derive(Debug)]
@ -21,6 +22,78 @@ struct PrecioPoint {
image_url: Option<String>,
}
// fn main() {
// let arg = args().skip(1).next().unwrap();
// let file_iter = fs::read_dir(arg)
// .unwrap()
// .filter(|pr| {
// if let Ok(p) = pr {
// !p.file_name().to_str().unwrap().ends_with(".link")
// } else {
// false
// }
// })
// .take(1000)
// .map(|f| fs::read(f.unwrap().path()).unwrap());
// let mut i = 0;
// for item in file_iter {
// i = i + 1;
// {
// // let mut text: Option<String> = None;
// // let mut price_str: Option<String> = None;
// // let mut rewriter = HtmlRewriter::new(
// // Settings {
// // element_content_handlers: vec![
// // // Rewrite insecure hyperlinks
// // element!("a[href]", |el| {
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
// // el.set_attribute("href", &href).unwrap();
// // Ok(())
// // }),
// // (
// // Cow::Owned("a".parse().unwrap()),
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
// // ),
// // element!(
// // "meta[property=\"product:price:amount\"]",
// // extract_first_attr(&mut price_str, "content")
// // ),
// // ],
// // memory_settings: lol_html::MemorySettings {
// // preallocated_parsing_buffer_size: 1024 * 16,
// // max_allowed_memory_usage: std::usize::MAX,
// // },
// // ..Settings::default()
// // },
// // |_: &[u8]| {},
// // );
// // rewriter.write(&item).unwrap();
// // rewriter.end().unwrap();
// // println!("{:#?}", price_str);
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
// let html = String::from_utf8(item).unwrap();
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
// match parse_carrefour("".into(), &dom) {
// Ok(point) => {
// // println!("{:?}", point);
// }
// Err(err) => {
// // println!("Error {:#?}: {}", err, html);
// }
// };
// }
// }
// println!("n={}", i);
// }
#[tokio::main]
async fn main() {
let mut args = args().skip(1);
@ -38,7 +111,10 @@ async fn main() {
let (res_sender, res_receiver) = async_channel::unbounded::<PrecioPoint>();
let mut handles = Vec::new();
for _ in 1..16 {
for _ in 1..env::var("N_COROUTINES")
.map_or(Ok(32), |s| s.parse::<usize>())
.unwrap()
{
let rx = receiver.clone();
let tx = res_sender.clone();
handles.push(tokio::spawn(worker(rx, tx)));
@ -70,7 +146,7 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
}
Err(err) => {
stderr()
.write_all(format!("Failed to fetch {}: {:#?}\n", url.as_str(), err).as_bytes())
.write_all(format!("Failed to fetch {}: {:?}\n", url.as_str(), err).as_bytes())
.await
.unwrap();
}
@ -81,14 +157,7 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
/// Reasons why fetching a product page or extracting data from it can fail.
#[derive(Debug)]
enum FetchError {
/// Wraps an error reported by `reqwest`.
HttpError(reqwest::Error),
NoPriceMetaEl,
NoMetaContent,
NotANumber,
NoStockMetaEl,
NoValidStockMeta,
NoSeedState,
NoProductInSeedState,
NoProductSkuInSeedState,
/// Expected data was missing or malformed in the page; the static message says what.
ParseError(&'static str),
}
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
@ -102,102 +171,124 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
.await
.map_err(|e| FetchError::HttpError(e))?;
let html = Html::parse_document(&body);
let dom = tl::parse(&body, tl::ParserOptions::default()).unwrap();
// let parser = dom.parser();
let point = parse_carrefour(url, html)?;
let point = parse_carrefour(url, &dom)?;
Ok(point)
}
fn parse_carrefour(url: String, html: Html) -> Result<PrecioPoint, FetchError> {
let meta_price_sel = Selector::parse("meta[property=\"product:price:amount\"]").unwrap();
let precio_centavos = match html.select(&meta_price_sel).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr.parse::<f64>() {
Ok(f) => Ok((f * 100.0) as u64),
Err(_) => Err(FetchError::NotANumber),
},
None => Err(FetchError::NoMetaContent),
},
None => Err(FetchError::NoPriceMetaEl),
/// Builds a `PrecioPoint` for `url` from an already parsed page.
///
/// Price and availability are read from `<meta property="...">` tags via
/// `get_meta_content`; the EAN is read from the JSON returned by
/// `parse_script_json(dom, "__STATE__")`.
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchError> {
// Price in centavos: the meta value parsed as f64, multiplied by 100 and cast to u64.
// Stays `None` when `get_meta_content` reports no such meta tag.
let precio_centavos = {
get_meta_content(dom, "product:price:amount")?
.map(|s| {
s.parse::<f64>()
.map_err(|_| FetchError::ParseError("Failed to parse number"))
})
.transpose()
.map(|f| f.map(|f| (f * 100.0) as u64))
}?;
let meta_stock_el = Selector::parse("meta[property=\"product:availability\"]").unwrap();
let in_stock = match html.select(&meta_stock_el).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr {
"oos" => Ok(Some(false)),
"instock" => Ok(Some(true)),
_ => Err(FetchError::NoValidStockMeta),
},
None => Err(FetchError::NoMetaContent),
// "oos" maps to Some(false) and "instock" to Some(true); any other value is an error,
// while a missing meta tag yields None.
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
let in_stock = match in_stock_meta {
Some(s) => match s.as_ref() {
"oos" => Some(false),
"instock" => Some(true),
_ => return Err(FetchError::ParseError("Not a valid product:availability")),
},
None => Err(FetchError::NoStockMetaEl),
}?;
None => None,
};
let ean = {
let state = parse_script_json(&html, "__STATE__").ok_or(FetchError::NoSeedState)?;
let seed_state = &state.as_object().ok_or(FetchError::NoSeedState)?;
let (_, product_json) = seed_state
// Two lookups in the state object: first the entry whose key starts with "Product:" and
// whose "__typename" is "Product", then the entry whose key starts with
// "Product:{cacheId}" and whose "__typename" is "SKU"; the latter's "ean" is the result.
let json = &parse_script_json(dom, "__STATE__")?;
let state = json
.as_object()
.ok_or(FetchError::ParseError("Seed state not an object"))?;
let (_, product_json) = state
.into_iter()
.find(|(key, val)| {
key.starts_with("Product:")
&& val.as_object().map_or(false, |val| {
val.get("__typename")
.map_or(false, |typename| typename == "Product")
})
&& val
.as_object()
.and_then(|val| val.get("__typename"))
.map_or(false, |typename| typename == "Product")
})
.ok_or(FetchError::NoProductInSeedState)?;
.ok_or(FetchError::ParseError("No product in seed state"))?;
let cache_id = product_json
.get("cacheId")
.ok_or(FetchError::NoProductInSeedState)?;
let (_, product_sku_json) = seed_state
.into_iter()
.filter_map(|(key, val)| val.as_object().map_or(None, |o| Some((key, o))))
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No cacheId in seed state"))?;
let (_, product_sku_json) = state
.iter()
.find(|(key, val)| {
key.starts_with(&format!("Product:{}", cache_id))
&& val
.get("__typename")
.map_or(false, |typename| typename == "SKU")
&& val.as_object().map_or(false, |obj| {
obj.get("__typename")
.map_or(false, |typename| typename == "SKU")
})
})
.ok_or(FetchError::NoProductSkuInSeedState)?;
.ok_or(FetchError::ParseError("No Product:cacheId* found"))?;
product_sku_json
.get("ean")
.ok_or(FetchError::NoProductSkuInSeedState)?
.as_str()
.ok_or(FetchError::NoProductSkuInSeedState)?
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No product SKU in seed state"))?
.to_string()
};
Ok(PrecioPoint {
ean: ean,
ean,
fetched_at: now_sec(),
in_stock: in_stock,
in_stock,
name: None,
image_url: None,
parser_version: 5,
precio_centavos: Some(precio_centavos),
precio_centavos,
url: url,
url,
})
}
fn parse_script_json(html: &Html, varname: &str) -> Option<serde_json::Value> {
let template_sel = Selector::parse(&format!(
"template[data-type=\"json\"][data-varname=\"{}\"]",
varname
))
.unwrap();
match html.select(&template_sel).next() {
Some(value) => match value.first_element_child() {
Some(script) => match serde_json::from_str(&script.inner_html()) {
Ok(val) => val,
Err(_) => None,
},
None => None,
},
None => None,
/// Returns the `content` attribute of the first `<meta property="{prop}">` tag in `dom`.
///
/// * `Ok(None)` - no matching tag could be resolved.
/// * `Ok(Some(text))` - the tag exists and carries a `content` value.
/// * `Err(FetchError::ParseError(..))` - the tag exists but has no `content` value.
fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a, str>>, FetchError> {
    let selector = format!("meta[property=\"{}\"]", prop);
    let meta_tag = dom
        .query_selector(&selector)
        .and_then(|mut matches| matches.next())
        .and_then(|handle| handle.get(dom.parser()))
        .and_then(|node| node.as_tag());

    // A missing tag is not an error; a tag without a usable `content` attribute is.
    meta_tag
        .map(|tag| {
            tag.attributes()
                .get("content")
                .flatten()
                .map(|bytes| bytes.as_utf8_str())
                .ok_or(FetchError::ParseError("Failed to get content attr"))
        })
        .transpose()
}
/// Parses the JSON stored under `<template data-type="json" data-varname="{varname}">`.
///
/// The JSON text is the inner HTML of the first node below the template that is a tag.
/// Fails with `FetchError::ParseError` when that tag cannot be found or when its
/// contents do not parse as JSON.
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, FetchError> {
    let parser = dom.parser();
    let selector = format!(
        "template[data-type=\"json\"][data-varname=\"{}\"]",
        varname
    );

    let template = dom
        .query_selector(&selector)
        .and_then(|mut matches| matches.next())
        .and_then(|handle| handle.get(parser))
        .and_then(|node| node.as_tag());

    let script = template
        .and_then(|tag| {
            tag.children()
                .all(parser)
                .iter()
                .find(|child| child.as_tag().is_some())
        })
        .ok_or(FetchError::ParseError("Failed to get script tag"))?;

    let json_text = script.inner_html(parser);
    json_text
        .parse()
        .map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))
}
fn now_sec() -> u64 {
let start = SystemTime::now();
let since_the_epoch = start
@ -207,9 +298,9 @@ fn now_sec() -> u64 {
}
/// Consumes `PrecioPoint`s from `rx` until `recv` returns an error, printing each one
/// with its `Debug` representation.
async fn db_writer(rx: Receiver<PrecioPoint>) {
let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
while let Ok(res) = rx.recv().await {
println!("{:#?}", res)
println!("{:?}", res)
}
}