mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
Compare commits
No commits in common. "78878d8b7ecac6d16d295d580c3722166479e23c" and "3cf723cc3d4ae78ce771b83228d5d41fe9a6ff00" have entirely different histories.
78878d8b7e
...
3cf723cc3d
47 changed files with 1019 additions and 915 deletions
|
@ -1,7 +1,7 @@
|
||||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
||||||
// README at: https://github.com/devcontainers/templates/tree/main/src/alpine
|
// README at: https://github.com/devcontainers/templates/tree/main/src/alpine
|
||||||
{
|
{
|
||||||
"name": "Debian",
|
"name": "Alpine",
|
||||||
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
||||||
"image": "mcr.microsoft.com/devcontainers/base:debian",
|
"image": "mcr.microsoft.com/devcontainers/base:debian",
|
||||||
"features": {
|
"features": {
|
||||||
|
|
|
@ -4,11 +4,4 @@ data/carrefour/
|
||||||
downloader/
|
downloader/
|
||||||
node_modules/
|
node_modules/
|
||||||
*/node_modules/
|
*/node_modules/
|
||||||
Containerfile
|
|
||||||
*/Containerfile
|
*/Containerfile
|
||||||
Dockerfile
|
|
||||||
*/Dockerfile
|
|
||||||
*.warc.zst
|
|
||||||
.git
|
|
||||||
scraper/debug/
|
|
||||||
*/target/
|
|
54
.github/workflows/container.yml
vendored
54
.github/workflows/container.yml
vendored
|
@ -1,54 +0,0 @@
|
||||||
name: check and publish container image
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: ["master"]
|
|
||||||
|
|
||||||
env:
|
|
||||||
REGISTRY: ghcr.io
|
|
||||||
IMAGE_NAME: ${{ github.repository }}
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check:
|
|
||||||
name: chequear typescript
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- uses: oven-sh/setup-bun@v1
|
|
||||||
|
|
||||||
- run: bun install
|
|
||||||
working-directory: ./sitio
|
|
||||||
- run: bun check
|
|
||||||
working-directory: ./sitio
|
|
||||||
- run: bun install
|
|
||||||
working-directory: ./scraper
|
|
||||||
- run: bun check
|
|
||||||
working-directory: ./scraper
|
|
||||||
|
|
||||||
build-and-push-sitio:
|
|
||||||
needs: check
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
packages: write
|
|
||||||
steps:
|
|
||||||
- name: Checkout repository
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Log in to the Container registry
|
|
||||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
|
||||||
with:
|
|
||||||
registry: ${{ env.REGISTRY }}
|
|
||||||
username: ${{ github.actor }}
|
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
- name: Extract metadata (tags, labels) for Docker
|
|
||||||
id: meta
|
|
||||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
|
||||||
with:
|
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
|
|
||||||
- name: Build and push Docker image
|
|
||||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: true
|
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
|
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -13,6 +13,3 @@ scraper/x.tsv
|
||||||
*.tmp
|
*.tmp
|
||||||
target/
|
target/
|
||||||
.env.*
|
.env.*
|
||||||
|
|
||||||
*/flamegraph.svg
|
|
||||||
*/perf.data*
|
|
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
|
@ -13,7 +13,7 @@
|
||||||
// https://github.com/vadimcn/codelldb/issues/884
|
// https://github.com/vadimcn/codelldb/issues/884
|
||||||
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
|
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
|
||||||
},
|
},
|
||||||
"args": ["../data/carrefour"],
|
"args": ["../data/samples/Carrefour.50.txt"],
|
||||||
"env": {}
|
"env": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
30
Dockerfile
30
Dockerfile
|
@ -1,30 +0,0 @@
|
||||||
FROM docker.io/oven/bun:1-alpine AS base
|
|
||||||
WORKDIR /usr/src/app
|
|
||||||
|
|
||||||
FROM base as build
|
|
||||||
ENV NODE_ENV=production
|
|
||||||
RUN apk add --no-cache nodejs
|
|
||||||
COPY . .
|
|
||||||
RUN bun install --frozen-lockfile
|
|
||||||
RUN cd sitio && \
|
|
||||||
bun run build
|
|
||||||
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
|
|
||||||
|
|
||||||
FROM cgr.dev/chainguard/wolfi-base
|
|
||||||
RUN apk add --no-cache nodejs npm jq bun sqlite
|
|
||||||
|
|
||||||
# Sitio
|
|
||||||
COPY --from=build /usr/src/app/sitio/package.json package.real.json
|
|
||||||
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
|
|
||||||
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
|
||||||
COPY --from=build /usr/src/app/sitio/build .
|
|
||||||
|
|
||||||
# Scraper
|
|
||||||
COPY --from=build /tmp/cli.build.js /bin/scraper
|
|
||||||
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
|
|
||||||
|
|
||||||
ENV NODE_ENV=production
|
|
||||||
ENV DB_PATH=/db/db.db
|
|
||||||
EXPOSE 3000
|
|
||||||
|
|
||||||
CMD ["node", "."]
|
|
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,6 +1,5 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
import { getUrlsFromSitemap } from "./common.js";
|
|
||||||
|
|
||||||
export async function scrapCarrefourProducts() {
|
export async function scrapCarrefourProducts() {
|
||||||
await scrapBySitemap();
|
await scrapBySitemap();
|
||||||
|
@ -26,7 +25,17 @@ async function scrapBySitemap() {
|
||||||
async (sitemapUrl) => {
|
async (sitemapUrl) => {
|
||||||
const res = await fetch(sitemapUrl);
|
const res = await fetch(sitemapUrl);
|
||||||
const xml = await res.text();
|
const xml = await res.text();
|
||||||
saveUrls(getUrlsFromSitemap(xml));
|
let urls = new Set<string>();
|
||||||
|
new HTMLRewriter()
|
||||||
|
.on("loc", {
|
||||||
|
text(element) {
|
||||||
|
const txt = element.text.trim();
|
||||||
|
if (!txt) return;
|
||||||
|
urls.add(txt);
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.transform(new Response(xml));
|
||||||
|
saveUrls(Array.from(urls));
|
||||||
},
|
},
|
||||||
{ concurrency: 3 }
|
{ concurrency: 3 }
|
||||||
);
|
);
|
17
carrefour-link-scraper/package.json
Normal file
17
carrefour-link-scraper/package.json
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"name": "carrefour-link-scraper",
|
||||||
|
"type": "module",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"linkedom": "^0.16.5",
|
||||||
|
"p-map": "^7.0.1"
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
import { getHtml } from "../scraper/fetch.js";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
@ -27,13 +28,12 @@ function getPage(url: string) {
|
||||||
return async () => {
|
return async () => {
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
html = await getHtml(url);
|
||||||
html = await res.text();
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
await getPage(url)();
|
await getPage(url)();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { document } = parseHTML(html);
|
const { document } = parseHTML(html.toString("utf-8"));
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"name": "link-scrapers",
|
"name": "coto-link-scraper",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "",
|
"description": "",
|
||||||
|
@ -11,7 +11,6 @@
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"entities": "^4.5.0",
|
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"p-queue": "^8.0.1"
|
"p-queue": "^8.0.1"
|
||||||
}
|
}
|
|
@ -1,3 +0,0 @@
|
||||||
version https://git-lfs.github.com/spec/v1
|
|
||||||
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
|
|
||||||
size 922185
|
|
|
@ -1,100 +0,0 @@
|
||||||
https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p
|
|
||||||
https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p
|
|
||||||
https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p
|
|
||||||
https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p
|
|
||||||
https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p
|
|
||||||
https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p
|
|
||||||
https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p
|
|
||||||
https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p
|
|
||||||
https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p
|
|
||||||
https://www.jumbo.com.ar/pizza-muzarella-e/p
|
|
||||||
https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p
|
|
||||||
https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p
|
|
||||||
https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p
|
|
||||||
https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p
|
|
||||||
https://www.jumbo.com.ar/espumante-chandon-demi-sec/p
|
|
||||||
https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p
|
|
||||||
https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p
|
|
||||||
https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p
|
|
||||||
https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p
|
|
||||||
https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p
|
|
||||||
https://www.jumbo.com.ar/campera-mujer-puffer-larga/p
|
|
||||||
https://www.jumbo.com.ar/tabla-de-quesos/p
|
|
||||||
https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p
|
|
||||||
https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p
|
|
||||||
https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p
|
|
||||||
https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p
|
|
||||||
https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p
|
|
||||||
https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p
|
|
||||||
https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p
|
|
||||||
https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p
|
|
||||||
https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p
|
|
||||||
https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p
|
|
||||||
https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p
|
|
||||||
https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p
|
|
||||||
https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p
|
|
||||||
https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p
|
|
||||||
https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p
|
|
||||||
https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p
|
|
||||||
https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p
|
|
||||||
https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p
|
|
||||||
https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p
|
|
||||||
https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p
|
|
||||||
https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p
|
|
||||||
https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p
|
|
||||||
https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p
|
|
||||||
https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p
|
|
||||||
https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p
|
|
||||||
https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p
|
|
||||||
https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p
|
|
||||||
https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p
|
|
||||||
https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p
|
|
||||||
https://www.jumbo.com.ar/oregano-100-gr/p
|
|
||||||
https://www.jumbo.com.ar/puerro-song/p
|
|
||||||
https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p
|
|
||||||
https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p
|
|
||||||
https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p
|
|
||||||
https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p
|
|
||||||
https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p
|
|
||||||
https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p
|
|
||||||
https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p
|
|
||||||
https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p
|
|
||||||
https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p
|
|
||||||
https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p
|
|
||||||
https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p
|
|
||||||
https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p
|
|
||||||
https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p
|
|
||||||
https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p
|
|
||||||
https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p
|
|
||||||
https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p
|
|
||||||
https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p
|
|
||||||
https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p
|
|
||||||
https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p
|
|
||||||
https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p
|
|
||||||
https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p
|
|
||||||
https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p
|
|
||||||
https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p
|
|
||||||
https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p
|
|
||||||
https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p
|
|
||||||
https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p
|
|
||||||
https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p
|
|
||||||
https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p
|
|
||||||
https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p
|
|
||||||
https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p
|
|
||||||
https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p
|
|
||||||
https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p
|
|
||||||
https://www.jumbo.com.ar/sabana-king-caracol-krea/p
|
|
||||||
https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p
|
|
||||||
https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p
|
|
||||||
https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p
|
|
||||||
https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p
|
|
||||||
https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p
|
|
||||||
https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p
|
|
||||||
https://www.jumbo.com.ar/budin-limon-y-amapolas/p
|
|
||||||
https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p
|
|
||||||
https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p
|
|
||||||
https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p
|
|
||||||
https://www.jumbo.com.ar/notcreamcheese-210-gr/p
|
|
||||||
https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p
|
|
||||||
https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p
|
|
||||||
https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p
|
|
|
@ -11,7 +11,7 @@
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"drizzle-orm": "^0.29.1"
|
"drizzle-orm": "=0.29.1"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/bun": "^1.0.0",
|
"@types/bun": "^1.0.0",
|
||||||
|
|
|
@ -2,23 +2,15 @@ export enum Supermercado {
|
||||||
Dia = "Dia",
|
Dia = "Dia",
|
||||||
Carrefour = "Carrefour",
|
Carrefour = "Carrefour",
|
||||||
Coto = "Coto",
|
Coto = "Coto",
|
||||||
Jumbo = "Jumbo",
|
|
||||||
}
|
}
|
||||||
export const supermercados: Supermercado[] = [
|
|
||||||
Supermercado.Carrefour,
|
|
||||||
Supermercado.Coto,
|
|
||||||
Supermercado.Dia,
|
|
||||||
Supermercado.Jumbo,
|
|
||||||
];
|
|
||||||
export const hosts: { [host: string]: Supermercado } = {
|
export const hosts: { [host: string]: Supermercado } = {
|
||||||
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
||||||
"www.carrefour.com.ar": Supermercado.Carrefour,
|
"www.carrefour.com.ar": Supermercado.Carrefour,
|
||||||
"www.cotodigital3.com.ar": Supermercado.Coto,
|
"www.cotodigital3.com.ar": Supermercado.Coto,
|
||||||
"www.jumbo.com.ar": Supermercado.Jumbo,
|
|
||||||
};
|
};
|
||||||
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||||
[Supermercado.Dia]: "#d52b1e",
|
[Supermercado.Dia]: "#d52b1e",
|
||||||
[Supermercado.Carrefour]: "#19549d",
|
[Supermercado.Carrefour]: "#19549d",
|
||||||
[Supermercado.Coto]: "#e20025",
|
[Supermercado.Coto]: "#e20025",
|
||||||
[Supermercado.Jumbo]: "#2dc850",
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
|
import { getHtml } from "../scraper/fetch.js";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
import { getUrlsFromSitemap } from "./common.js";
|
|
||||||
|
|
||||||
const categorias = [
|
const categorias = [
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||||
|
@ -81,15 +81,21 @@ async function scrapBySitemap() {
|
||||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||||
];
|
];
|
||||||
|
|
||||||
await pMap(
|
await pMap(sitemaps, async (sitemapUrl) => {
|
||||||
sitemaps,
|
|
||||||
async (sitemapUrl) => {
|
|
||||||
const res = await fetch(sitemapUrl);
|
const res = await fetch(sitemapUrl);
|
||||||
const xml = await res.text();
|
const xml = await res.text();
|
||||||
saveUrls(getUrlsFromSitemap(xml));
|
let urls = new Set<string>();
|
||||||
|
new HTMLRewriter()
|
||||||
|
.on("loc", {
|
||||||
|
text(element) {
|
||||||
|
const txt = element.text.trim();
|
||||||
|
if (!txt) return;
|
||||||
|
urls.add(txt);
|
||||||
},
|
},
|
||||||
{ concurrency: 3 }
|
})
|
||||||
);
|
.transform(new Response(xml));
|
||||||
|
saveUrls(Array.from(urls));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapBySite() {
|
async function scrapBySite() {
|
||||||
|
@ -104,9 +110,8 @@ async function scrapBySite() {
|
||||||
await pMap(
|
await pMap(
|
||||||
links,
|
links,
|
||||||
async (url) => {
|
async (url) => {
|
||||||
const res = await fetch(url);
|
const html = await getHtml(url);
|
||||||
const html = await res.text();
|
const { document } = parseHTML(html.toString("utf-8"));
|
||||||
const { document } = parseHTML(html);
|
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(
|
document.querySelectorAll<HTMLAnchorElement>(
|
17
dia-link-scraper/package.json
Normal file
17
dia-link-scraper/package.json
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"name": "dia-link-scraper",
|
||||||
|
"type": "module",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"linkedom": "^0.16.5",
|
||||||
|
"p-map": "^7.0.0"
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,14 +0,0 @@
|
||||||
import { decodeXML } from "entities";
|
|
||||||
export function getUrlsFromSitemap(xml: string) {
|
|
||||||
let urls = new Set<string>();
|
|
||||||
new HTMLRewriter()
|
|
||||||
.on("loc", {
|
|
||||||
text(element) {
|
|
||||||
const txt = element.text.trim();
|
|
||||||
if (!txt) return;
|
|
||||||
urls.add(decodeXML(txt));
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.transform(new Response(xml));
|
|
||||||
return Array.from(urls);
|
|
||||||
}
|
|
|
@ -1,38 +0,0 @@
|
||||||
import pMap from "p-map";
|
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
|
||||||
import { getUrlsFromSitemap } from "./common.js";
|
|
||||||
|
|
||||||
export async function scrapJumboProducts() {
|
|
||||||
await scrapBySitemap();
|
|
||||||
}
|
|
||||||
|
|
||||||
async function scrapBySitemap() {
|
|
||||||
// de https://www.jumbo.com.ar/sitemap.xml
|
|
||||||
const sitemaps = [
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-1.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-10.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-11.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-12.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-13.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-14.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-15.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-2.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-3.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-4.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-5.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-6.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-7.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-8.xml",
|
|
||||||
"https://www.jumbo.com.ar/sitemap/product-9.xml",
|
|
||||||
];
|
|
||||||
|
|
||||||
await pMap(
|
|
||||||
sitemaps,
|
|
||||||
async (sitemapUrl) => {
|
|
||||||
const res = await fetch(sitemapUrl);
|
|
||||||
const xml = await res.text();
|
|
||||||
saveUrls(getUrlsFromSitemap(xml));
|
|
||||||
},
|
|
||||||
{ concurrency: 3 }
|
|
||||||
);
|
|
||||||
}
|
|
|
@ -2,7 +2,9 @@
|
||||||
"name": "preciazo",
|
"name": "preciazo",
|
||||||
"private": true,
|
"private": true,
|
||||||
"workspaces": [
|
"workspaces": [
|
||||||
"link-scrapers",
|
"dia-link-scraper",
|
||||||
|
"coto-link-scraper",
|
||||||
|
"carrefour-link-scraper",
|
||||||
"scraper",
|
"scraper",
|
||||||
"sitio",
|
"sitio",
|
||||||
"db-datos"
|
"db-datos"
|
||||||
|
|
18
readme.md
18
readme.md
|
@ -4,23 +4,33 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
|
||||||
|
|
||||||
## componentes (en orden de proceso)
|
## componentes (en orden de proceso)
|
||||||
|
|
||||||
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
|
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
|
||||||
|
|
||||||
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
||||||
|
|
||||||
- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
|
- [warcificator](./warcificator/) descarga las paginas de productos y genera un archivo [WARC](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) con ellas
|
||||||
|
- el [scraper](./scraper/) procesa estos WARCs, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
|
||||||
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
|
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
|
||||||
|
|
||||||
## setup
|
## setup
|
||||||
|
|
||||||
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js.
|
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js.
|
||||||
|
|
||||||
después, escrapea un sample de productos de Carrefour a una BD:
|
aparte, se necesita zstd, que se usa para comprimir los WARCs eficientemente. seguro está disponible en las repos de tu distro favorita :)
|
||||||
|
|
||||||
|
empezá descargando un WARC con 50 páginas de sample, y recomprimilo con zstd:
|
||||||
|
|
||||||
|
```
|
||||||
|
wget --no-verbose --tries=3 --delete-after --input-file ./data/samples/Dia.txt --warc-file=dia-sample
|
||||||
|
gzip -dc dia-sample.warc.gz | zstd --long -15 --no-sparse -o dia-sample.warc.zst
|
||||||
|
```
|
||||||
|
|
||||||
|
después, scrapealo a una BD:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd scraper/
|
cd scraper/
|
||||||
bun install
|
bun install
|
||||||
bun cli.ts scrap ./data/samples/Carrefour.50.txt
|
bun cli.ts scrap ../dia-sample.warc.zst
|
||||||
```
|
```
|
||||||
|
|
||||||
ahora miralo en el sitio:
|
ahora miralo en el sitio:
|
||||||
|
|
42
scraper/Containerfile
Normal file
42
scraper/Containerfile
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
FROM docker.io/oven/bun:1-alpine AS base
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
FROM base AS builder
|
||||||
|
ENV NODE_ENV=production
|
||||||
|
COPY . .
|
||||||
|
RUN bun install --frozen-lockfile \
|
||||||
|
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
|
||||||
|
&& rm -rf node_modules/
|
||||||
|
|
||||||
|
# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd
|
||||||
|
FROM docker.io/rust:1.74 AS warcificator-builder
|
||||||
|
WORKDIR /usr/src/
|
||||||
|
RUN rustup target add x86_64-unknown-linux-musl
|
||||||
|
RUN apt-get update && apt-get install -y musl-tools musl-dev
|
||||||
|
|
||||||
|
RUN USER=root cargo new warcificator
|
||||||
|
WORKDIR /usr/src/warcificator
|
||||||
|
COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./
|
||||||
|
RUN cargo build --release
|
||||||
|
|
||||||
|
COPY ./warcificator/src ./src
|
||||||
|
RUN cargo install --target x86_64-unknown-linux-musl --path .
|
||||||
|
|
||||||
|
FROM base
|
||||||
|
RUN apk add --no-cache wget zstd tini
|
||||||
|
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
|
||||||
|
&& chmod +x /etc/periodic/daily/scraper
|
||||||
|
|
||||||
|
COPY --from=builder /tmp/cli.build.js /bin/scraper
|
||||||
|
COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/
|
||||||
|
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
|
||||||
|
COPY --from=builder /usr/src/app/data /listas
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
VOLUME /db
|
||||||
|
ENV NODE_ENV=production
|
||||||
|
ENV DB_PATH=/db/db.db
|
||||||
|
ENV LISTS_DIR=/listas/
|
||||||
|
|
||||||
|
CMD ["tini", "/bin/busybox", "crond", "-f", "-l2"]
|
||||||
|
# CMD ["bun", "/bin/scraper"]
|
146
scraper/auto.ts
146
scraper/auto.ts
|
@ -1,20 +1,29 @@
|
||||||
import { mkdtemp, writeFile } from "node:fs/promises";
|
import { mkdtemp, access, writeFile } from "node:fs/promises";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { join } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
|
import { spawn } from "node:child_process";
|
||||||
|
import { Supermercado, hosts } from "db-datos/supermercado.js";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { formatDuration, intervalToDuration } from "date-fns";
|
import { format, formatDuration, intervalToDuration } from "date-fns";
|
||||||
import { downloadList } from "./scrap.js";
|
import { parseWarc } from "./scrap.js";
|
||||||
|
import { S3Client } from "@aws-sdk/client-s3";
|
||||||
|
import { Upload } from "@aws-sdk/lib-storage";
|
||||||
|
import { BunFile } from "bun";
|
||||||
import { db } from "db-datos/db.js";
|
import { db } from "db-datos/db.js";
|
||||||
import { like } from "drizzle-orm";
|
import { like } from "drizzle-orm";
|
||||||
import { productoUrls } from "db-datos/schema.js";
|
import { productoUrls } from "db-datos/schema.js";
|
||||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
||||||
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
|
||||||
|
const supermercados: Supermercado[] = [
|
||||||
|
Supermercado.Carrefour,
|
||||||
|
Supermercado.Coto,
|
||||||
|
Supermercado.Dia,
|
||||||
|
];
|
||||||
|
|
||||||
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
|
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
|
||||||
const scrapQueue = new PQueue({ concurrency: 4 });
|
const scrapQueue = new PQueue({ concurrency: 1 });
|
||||||
|
|
||||||
export async function auto() {
|
export async function auto() {
|
||||||
const a = new Auto();
|
const a = new Auto();
|
||||||
|
@ -22,9 +31,35 @@ export async function auto() {
|
||||||
}
|
}
|
||||||
|
|
||||||
class Auto {
|
class Auto {
|
||||||
|
s3Config?: { s3: S3Client; bucketName: string };
|
||||||
telegramConfig?: { token: string; chatId: string };
|
telegramConfig?: { token: string; chatId: string };
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
|
if (
|
||||||
|
!process.env.S3_ACCESS_KEY_ID ||
|
||||||
|
!process.env.S3_SECRET_ACCESS_KEY ||
|
||||||
|
!process.env.S3_BUCKET_NAME
|
||||||
|
) {
|
||||||
|
if (process.env.NODE_ENV === "development") {
|
||||||
|
console.warn("faltan creds de s3, no voy a subir a s3");
|
||||||
|
} else {
|
||||||
|
throw new Error("faltan creds de s3");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
this.s3Config = {
|
||||||
|
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
|
||||||
|
s3: new S3Client({
|
||||||
|
endpoint: "https://s3.us-west-004.backblazeb2.com",
|
||||||
|
region: "us-west-004",
|
||||||
|
credentials: {
|
||||||
|
accessKeyId: process.env.S3_ACCESS_KEY_ID,
|
||||||
|
secretAccessKey: process.env.S3_SECRET_ACCESS_KEY,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
bucketName: process.env.S3_BUCKET_NAME,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
if (!process.env.TELEGRAM_BOT_TOKEN)
|
if (!process.env.TELEGRAM_BOT_TOKEN)
|
||||||
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
|
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
|
||||||
else if (!process.env.TELEGRAM_BOT_CHAT_ID)
|
else if (!process.env.TELEGRAM_BOT_CHAT_ID)
|
||||||
|
@ -54,9 +89,6 @@ class Auto {
|
||||||
case "Carrefour":
|
case "Carrefour":
|
||||||
await scrapCarrefourProducts();
|
await scrapCarrefourProducts();
|
||||||
break;
|
break;
|
||||||
case "Jumbo":
|
|
||||||
await scrapJumboProducts();
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
this.inform(
|
this.inform(
|
||||||
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
||||||
|
@ -75,29 +107,93 @@ class Auto {
|
||||||
const urls = results.map((r) => r.url);
|
const urls = results.map((r) => r.url);
|
||||||
await writeFile(listPath, urls.join("\n") + "\n");
|
await writeFile(listPath, urls.join("\n") + "\n");
|
||||||
|
|
||||||
this.scrapAndInform({ listPath });
|
const date = new Date();
|
||||||
|
const zstdWarcName = `${supermercado}-${format(
|
||||||
|
date,
|
||||||
|
"yyyy-MM-dd-HH:mm"
|
||||||
|
)}.warc.zst`;
|
||||||
|
const zstdWarcPath = join(ctxPath, zstdWarcName);
|
||||||
|
const subproc = Bun.spawn({
|
||||||
|
cmd: ["warcificator", listPath, zstdWarcPath],
|
||||||
|
stderr: "ignore",
|
||||||
|
stdout: "ignore",
|
||||||
|
cwd: ctxPath,
|
||||||
|
});
|
||||||
|
const t0 = performance.now();
|
||||||
|
await subproc.exited;
|
||||||
|
this.inform(
|
||||||
|
`[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!(await fileExists(zstdWarcPath))) {
|
||||||
|
const err = this.report(`no encontré el ${zstdWarcPath}`);
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.scrapAndInform({ zstdWarcPath, zstdWarcName });
|
||||||
|
|
||||||
|
try {
|
||||||
|
await this.uploadToBucket({
|
||||||
|
fileName: zstdWarcName,
|
||||||
|
file: Bun.file(zstdWarcPath),
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
|
||||||
|
console.error(error);
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: borrar archivos temporales
|
// TODO: borrar archivos temporales
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapAndInform({ listPath }: { listPath: string }) {
|
async scrapAndInform({
|
||||||
|
zstdWarcPath,
|
||||||
|
zstdWarcName,
|
||||||
|
}: {
|
||||||
|
zstdWarcPath: string;
|
||||||
|
zstdWarcName: string;
|
||||||
|
}) {
|
||||||
const res = await scrapQueue.add(async () => {
|
const res = await scrapQueue.add(async () => {
|
||||||
const t0 = performance.now();
|
const t0 = performance.now();
|
||||||
const progress = await downloadList(listPath);
|
const progress = await parseWarc(zstdWarcPath);
|
||||||
return { took: performance.now() - t0, progress };
|
return { took: performance.now() - t0, progress };
|
||||||
});
|
});
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
const { took, progress } = res;
|
const { took, progress } = res;
|
||||||
this.inform(
|
this.inform(
|
||||||
`Procesado ${listPath} (${progress.done} ok, ${
|
`Procesado ${zstdWarcName} (${progress.done} ok, ${
|
||||||
progress.skipped
|
progress.errors.length
|
||||||
} skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})`
|
} errores) (tardó ${formatMs(took)})`
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
this.inform(`Algo falló en ${listPath}`);
|
this.inform(`Algo falló en ${zstdWarcName}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async uploadToBucket({
|
||||||
|
fileName,
|
||||||
|
file,
|
||||||
|
}: {
|
||||||
|
fileName: string;
|
||||||
|
file: BunFile;
|
||||||
|
}) {
|
||||||
|
if (!this.s3Config) {
|
||||||
|
this.inform(
|
||||||
|
`[s3] Se intentó subir ${fileName} pero no tenemos creds de S3`
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const parallelUploads3 = new Upload({
|
||||||
|
client: this.s3Config.s3,
|
||||||
|
params: {
|
||||||
|
Bucket: this.s3Config.bucketName,
|
||||||
|
Key: fileName,
|
||||||
|
Body: file,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
await parallelUploads3.done();
|
||||||
|
}
|
||||||
|
|
||||||
inform(msg: string) {
|
inform(msg: string) {
|
||||||
this.sendTelegramMsg(msg);
|
this.sendTelegramMsg(msg);
|
||||||
console.info(msg);
|
console.info(msg);
|
||||||
|
@ -120,6 +216,16 @@ class Auto {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists
|
||||||
|
async function fileExists(path: string) {
|
||||||
|
try {
|
||||||
|
access(path);
|
||||||
|
return true;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function formatMs(ms: number) {
|
function formatMs(ms: number) {
|
||||||
return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) }));
|
return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) }));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
||||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||||
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
|
||||||
import { auto } from "./auto.js";
|
import { auto } from "./auto.js";
|
||||||
import { downloadList, getProduct } from "./scrap.js";
|
import { parseWarc } from "./scrap.js";
|
||||||
|
|
||||||
if (process.argv[2] === "auto") {
|
if (process.argv[2] === "auto") {
|
||||||
await auto();
|
await auto();
|
||||||
|
@ -13,24 +12,17 @@ if (process.argv[2] === "auto") {
|
||||||
await scrapDiaProducts();
|
await scrapDiaProducts();
|
||||||
} else if (process.argv[2] === "scrap-coto-links") {
|
} else if (process.argv[2] === "scrap-coto-links") {
|
||||||
await scrapCotoProducts();
|
await scrapCotoProducts();
|
||||||
} else if (process.argv[2] === "scrap-jumbo-links") {
|
|
||||||
await scrapJumboProducts();
|
|
||||||
} else if (process.argv[2] === "scrap-link") {
|
|
||||||
const url = new URL(process.argv[3]);
|
|
||||||
const res = await fetch(url);
|
|
||||||
const text = await res.text();
|
|
||||||
console.info(await getProduct(url, text));
|
|
||||||
} else if (process.argv[2] === "scrap") {
|
} else if (process.argv[2] === "scrap") {
|
||||||
const urlLists = process.argv.slice(3);
|
const warcPaths = process.argv.slice(3);
|
||||||
if (urlLists.length > 0) {
|
if (warcPaths.length > 0) {
|
||||||
for (const path of urlLists) {
|
for (const path of warcPaths) {
|
||||||
const res = await downloadList(path);
|
const res = await parseWarc(path);
|
||||||
console.info("=======================================");
|
console.info("=======================================");
|
||||||
console.info(path, res);
|
console.info(path, res);
|
||||||
console.info("=======================================");
|
console.info("=======================================");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
console.error("Especificá listas de urls para scrapear.");
|
console.error("Especificá WARCs para scrapear.");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] {
|
||||||
const scripts = dom.window.document.querySelectorAll(
|
const scripts = dom.window.document.querySelectorAll(
|
||||||
'script[type="application/ld+json"]'
|
'script[type="application/ld+json"]'
|
||||||
);
|
);
|
||||||
return Array.from(scripts, (script) => JSON.parse(script.innerHTML));
|
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
|
||||||
}
|
}
|
||||||
function findJsonLd(dom: Window, type: string): object | undefined {
|
function findJsonLd(dom: Window, type: string): object | undefined {
|
||||||
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
||||||
|
@ -31,9 +31,8 @@ const zProductLd = z.object({
|
||||||
"@type": z.literal("Product"),
|
"@type": z.literal("Product"),
|
||||||
name: z.string(),
|
name: z.string(),
|
||||||
image: z.string(),
|
image: z.string(),
|
||||||
sku: z.string().optional(),
|
|
||||||
offers: z.object({
|
offers: z.object({
|
||||||
offers: z.array(
|
offers: z.tuple([
|
||||||
z.object({
|
z.object({
|
||||||
"@type": z.literal("Offer"),
|
"@type": z.literal("Offer"),
|
||||||
price: z.number(),
|
price: z.number(),
|
||||||
|
@ -42,8 +41,8 @@ const zProductLd = z.object({
|
||||||
"http://schema.org/OutOfStock",
|
"http://schema.org/OutOfStock",
|
||||||
"http://schema.org/InStock",
|
"http://schema.org/InStock",
|
||||||
]),
|
]),
|
||||||
})
|
}),
|
||||||
),
|
]),
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
type ProductLd = z.infer<typeof zProductLd>;
|
type ProductLd = z.infer<typeof zProductLd>;
|
13
scraper/fetch.ts
Normal file
13
scraper/fetch.ts
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
export async function getHtml(url: string) {
|
||||||
|
const res = await fetch(url);
|
||||||
|
return readableToBuffer(res.body!);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function readableToBuffer(source: AsyncIterable<any>) {
|
||||||
|
// https://stackoverflow.com/a/72891118
|
||||||
|
const buffers = [];
|
||||||
|
for await (const data of source) {
|
||||||
|
buffers.push(data);
|
||||||
|
}
|
||||||
|
return Buffer.concat(buffers);
|
||||||
|
}
|
|
@ -5,7 +5,8 @@
|
||||||
"description": "",
|
"description": "",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"check": "tsc"
|
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
|
||||||
|
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
|
||||||
},
|
},
|
||||||
"keywords": [],
|
"keywords": [],
|
||||||
"author": "",
|
"author": "",
|
||||||
|
@ -15,11 +16,11 @@
|
||||||
"@aws-sdk/lib-storage": "^3.478.0",
|
"@aws-sdk/lib-storage": "^3.478.0",
|
||||||
"date-fns": "^3.0.6",
|
"date-fns": "^3.0.6",
|
||||||
"db-datos": "workspace:^",
|
"db-datos": "workspace:^",
|
||||||
"drizzle-orm": "^0.29.1",
|
"drizzle-orm": "=0.29.1",
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
"p-map": "^7.0.1",
|
|
||||||
"p-queue": "^8.0.1",
|
"p-queue": "^8.0.1",
|
||||||
|
"warcio": "^2.2.1",
|
||||||
"zod": "^3.22.4"
|
"zod": "^3.22.4"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish } from "../scrap.js";
|
import { Precioish } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
||||||
|
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||||
|
|
|
@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) {
|
||||||
}
|
}
|
||||||
function getPriceFromText({ document }: Window) {
|
function getPriceFromText({ document }: Window) {
|
||||||
const el = document.querySelector(".atg_store_newPrice");
|
const el = document.querySelector(".atg_store_newPrice");
|
||||||
if (!el?.textContent) return null;
|
if (!el?.textContent) throw new Error("no encuentro el precio");
|
||||||
const nStr = el.textContent
|
const nStr = el.textContent
|
||||||
.trim()
|
.trim()
|
||||||
.replace("$", "")
|
.replace("$", "")
|
||||||
|
@ -27,16 +27,12 @@ function getPriceFromText({ document }: Window) {
|
||||||
.replace(",", ".");
|
.replace(",", ".");
|
||||||
return parseFloat(nStr) * 100;
|
return parseFloat(nStr) * 100;
|
||||||
}
|
}
|
||||||
function getInStock({ document }: Window) {
|
|
||||||
return !document.querySelector(".product_not_available");
|
|
||||||
}
|
|
||||||
|
|
||||||
export function getCotoProduct(html: string | Buffer): Precioish {
|
export function getCotoProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
||||||
const ean = getEanFromText(dom);
|
const ean = getEanFromText(dom);
|
||||||
const precioCentavos = getPriceFromText(dom);
|
const precioCentavos = getPriceFromText(dom);
|
||||||
const inStock = getInStock(dom);
|
|
||||||
|
|
||||||
const name = dom.document
|
const name = dom.document
|
||||||
.querySelector("h1.product_page")
|
.querySelector("h1.product_page")
|
||||||
|
@ -44,5 +40,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
|
||||||
const imageUrl =
|
const imageUrl =
|
||||||
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
|
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
|
||||||
|
|
||||||
return { name, imageUrl, ean, precioCentavos, inStock };
|
return { name, imageUrl, ean, precioCentavos };
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { type Precioish } from "../scrap.js";
|
import { type Precioish } from "../scrap.js";
|
||||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
||||||
|
|
||||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
|
@ -1,54 +0,0 @@
|
||||||
import { parseHTML } from "linkedom";
|
|
||||||
import { type Precioish } from "../scrap.js";
|
|
||||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
|
||||||
import { z } from "zod";
|
|
||||||
|
|
||||||
const zJumboSearch = z.tuple([
|
|
||||||
z.object({
|
|
||||||
items: z.array(
|
|
||||||
z.object({
|
|
||||||
ean: z.string(),
|
|
||||||
})
|
|
||||||
),
|
|
||||||
}),
|
|
||||||
]);
|
|
||||||
|
|
||||||
async function getEanFromSearch(sku: string) {
|
|
||||||
const url = new URL(
|
|
||||||
"https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
|
|
||||||
);
|
|
||||||
url.searchParams.set("fq", `skuId:${sku}`);
|
|
||||||
const res = await fetch(url);
|
|
||||||
const json = await res.json();
|
|
||||||
const parsed = zJumboSearch.parse(json);
|
|
||||||
const ean = parsed[0].items[0].ean;
|
|
||||||
if (!parsed[0].items.every((x) => x.ean === ean)) {
|
|
||||||
throw new Error("Inesperado: no todos los items tienen el mismo EAN");
|
|
||||||
}
|
|
||||||
return ean;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getJumboProduct(
|
|
||||||
html: string | Buffer
|
|
||||||
): Promise<Precioish> {
|
|
||||||
const dom = parseHTML(html);
|
|
||||||
const precioCentavos = priceFromMeta(dom);
|
|
||||||
const inStock = stockFromMeta(dom);
|
|
||||||
|
|
||||||
const ld = getProductJsonLd(dom);
|
|
||||||
const name = ld.name;
|
|
||||||
const imageUrl = ld.image;
|
|
||||||
|
|
||||||
const retailerSku = ld.sku;
|
|
||||||
if (!retailerSku)
|
|
||||||
throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
|
|
||||||
const ean = await getEanFromSearch(retailerSku);
|
|
||||||
|
|
||||||
return {
|
|
||||||
name,
|
|
||||||
imageUrl,
|
|
||||||
ean,
|
|
||||||
precioCentavos,
|
|
||||||
inStock,
|
|
||||||
};
|
|
||||||
}
|
|
167
scraper/scrap.ts
167
scraper/scrap.ts
|
@ -1,127 +1,112 @@
|
||||||
/// <reference lib="dom" />
|
|
||||||
import * as schema from "db-datos/schema.js";
|
import * as schema from "db-datos/schema.js";
|
||||||
import { writeFile, mkdir } from "fs/promises";
|
import { WARCParser } from "warcio";
|
||||||
|
import { writeFile } from "fs/promises";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||||
import { getDiaProduct } from "./parsers/dia.js";
|
import { getDiaProduct } from "./parsers/dia.js";
|
||||||
import { getCotoProduct } from "./parsers/coto.js";
|
import { getCotoProduct } from "./parsers/coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
|
import { and, eq, sql } from "drizzle-orm";
|
||||||
import { db } from "db-datos/db.js";
|
import { db } from "db-datos/db.js";
|
||||||
import pMap from "p-map";
|
|
||||||
import { getJumboProduct } from "./parsers/jumbo.js";
|
|
||||||
|
|
||||||
const DEBUG = true;
|
const DEBUG = false;
|
||||||
const PARSER_VERSION = 4;
|
const PARSER_VERSION = 4;
|
||||||
|
|
||||||
|
const getPrevPrecio = db
|
||||||
|
.select({ id: schema.precios.id })
|
||||||
|
.from(schema.precios)
|
||||||
|
.where(
|
||||||
|
and(
|
||||||
|
eq(schema.precios.warcRecordId, sql.placeholder("warcRecordId")),
|
||||||
|
eq(schema.precios.parserVersion, PARSER_VERSION)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.limit(1)
|
||||||
|
.prepare();
|
||||||
|
|
||||||
export type Precio = typeof schema.precios.$inferInsert;
|
export type Precio = typeof schema.precios.$inferInsert;
|
||||||
export type Precioish = Omit<
|
export type Precioish = Omit<
|
||||||
Precio,
|
Precio,
|
||||||
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
|
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
|
||||||
>;
|
>;
|
||||||
|
|
||||||
export async function downloadList(path: string) {
|
export async function parseWarc(path: string) {
|
||||||
let list = (await Bun.file(path).text())
|
// const warc = createReadStream(path);
|
||||||
.split("\n")
|
|
||||||
.filter((s) => s.length > 0);
|
|
||||||
|
|
||||||
const results = await pMap(
|
|
||||||
list,
|
|
||||||
async (urlS) => {
|
|
||||||
let res: ScrapResult = { type: "skipped" };
|
|
||||||
for (let attempts = 0; attempts < 6; attempts++) {
|
|
||||||
if (attempts !== 0) await wait(1500);
|
|
||||||
res = await scrap(urlS);
|
|
||||||
if (res.type === "done" || res.type === "skipped") {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (res.type === "error") console.error(res);
|
|
||||||
return res;
|
|
||||||
},
|
|
||||||
{ concurrency: 32 }
|
|
||||||
);
|
|
||||||
|
|
||||||
let progress: {
|
let progress: {
|
||||||
done: number;
|
done: number;
|
||||||
skipped: number;
|
errors: { error: any; warcRecordId: string; path: string }[];
|
||||||
errors: { error: any; url: string; debugPath: string }[];
|
} = { done: 0, errors: [] };
|
||||||
} = { done: 0, skipped: 0, errors: [] };
|
|
||||||
for (const result of results) {
|
|
||||||
switch (result.type) {
|
|
||||||
case "done":
|
|
||||||
progress.done++;
|
|
||||||
break;
|
|
||||||
case "error":
|
|
||||||
progress.errors.push(result);
|
|
||||||
break;
|
|
||||||
case "skipped":
|
|
||||||
progress.skipped++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return progress;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getProduct(url: URL, html: string): Promise<Precioish> {
|
const proc = Bun.spawn(["zstdcat", "-d", path], {});
|
||||||
if (url.hostname === "www.carrefour.com.ar") return getCarrefourProduct(html);
|
const warc = proc.stdout;
|
||||||
|
// TODO: tirar error si falla zstd
|
||||||
|
|
||||||
|
const parser = new WARCParser(warc);
|
||||||
|
for await (const record of parser) {
|
||||||
|
if (record.warcType === "response") {
|
||||||
|
if (!record.warcTargetURI) continue;
|
||||||
|
const warcRecordId = record.warcHeader("WARC-Record-ID");
|
||||||
|
if (!warcRecordId) throw new Error("No tiene WARC-Record-ID");
|
||||||
|
|
||||||
|
if (getPrevPrecio.get({ warcRecordId })) {
|
||||||
|
console.debug(`skipped ${warcRecordId}`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (record.httpHeaders?.statusCode !== 200) {
|
||||||
|
console.debug(
|
||||||
|
`skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)`
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
||||||
|
|
||||||
|
const html = await record.contentText();
|
||||||
|
|
||||||
|
const url = new URL(record.warcTargetURI);
|
||||||
|
try {
|
||||||
|
let ish: Precioish | undefined = undefined;
|
||||||
|
if (url.hostname === "www.carrefour.com.ar")
|
||||||
|
ish = getCarrefourProduct(html);
|
||||||
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
||||||
return getDiaProduct(html);
|
ish = getDiaProduct(html);
|
||||||
else if (url.hostname === "www.cotodigital3.com.ar")
|
else if (url.hostname === "www.cotodigital3.com.ar")
|
||||||
return getCotoProduct(html);
|
ish = getCotoProduct(html);
|
||||||
else if (url.hostname === "www.jumbo.com.ar")
|
|
||||||
return await getJumboProduct(html);
|
|
||||||
else throw new Error(`Unknown host ${url.hostname}`);
|
else throw new Error(`Unknown host ${url.hostname}`);
|
||||||
}
|
|
||||||
|
|
||||||
type ScrapResult =
|
|
||||||
| { type: "skipped" }
|
|
||||||
| { type: "done" }
|
|
||||||
| { type: "error"; url: string; error: any; debugPath: string };
|
|
||||||
async function scrap(urlS: string): Promise<ScrapResult> {
|
|
||||||
let url;
|
|
||||||
try {
|
|
||||||
url = new URL(urlS);
|
|
||||||
} catch (err) {
|
|
||||||
console.error(`skipped ${urlS} because ${err}`);
|
|
||||||
return { type: "skipped" };
|
|
||||||
}
|
|
||||||
const res = await fetch(url);
|
|
||||||
if (!res.ok) {
|
|
||||||
console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
|
|
||||||
return { type: "skipped" };
|
|
||||||
}
|
|
||||||
|
|
||||||
const html = await res.text();
|
|
||||||
|
|
||||||
try {
|
|
||||||
let ish = await getProduct(url, html);
|
|
||||||
|
|
||||||
const p: Precio = {
|
const p: Precio = {
|
||||||
...ish,
|
...ish,
|
||||||
fetchedAt: new Date(),
|
fetchedAt: new Date(record.warcDate!),
|
||||||
url: urlS,
|
url: record.warcTargetURI,
|
||||||
|
warcRecordId,
|
||||||
parserVersion: PARSER_VERSION,
|
parserVersion: PARSER_VERSION,
|
||||||
};
|
};
|
||||||
|
|
||||||
await db.insert(schema.precios).values(p);
|
await db.insert(schema.precios).values(p);
|
||||||
|
|
||||||
return { type: "done" };
|
progress.done++;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const urlHash = createHash("md5").update(urlS).digest("hex");
|
console.error({ path, warcRecordId, error });
|
||||||
const output = join("debug", `${urlHash}.html`);
|
progress.errors.push({
|
||||||
if (DEBUG) {
|
path,
|
||||||
await mkdir("debug", { recursive: true });
|
warcRecordId,
|
||||||
await writeFile(output, html);
|
|
||||||
}
|
|
||||||
return {
|
|
||||||
type: "error",
|
|
||||||
url: urlS,
|
|
||||||
error,
|
error,
|
||||||
debugPath: output,
|
});
|
||||||
};
|
|
||||||
|
if (DEBUG) {
|
||||||
|
const urlHash = createHash("md5")
|
||||||
|
.update(record.warcTargetURI!)
|
||||||
|
.digest("hex");
|
||||||
|
const output = join("debug", `${urlHash}.html`);
|
||||||
|
await writeFile(output, html);
|
||||||
|
console.error(`wrote html to ${output}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function wait(ms: number) {
|
if ((await proc.exited) !== 0) {
|
||||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
throw new Error("zstd tiró un error");
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
{
|
{
|
||||||
"extends": "../tsconfig.json",
|
"extends": "../tsconfig.json"
|
||||||
"exclude": ["../sitio"]
|
|
||||||
}
|
}
|
||||||
|
|
157
scraper/warc.ts
Normal file
157
scraper/warc.ts
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
const crlf = "\r\n";
|
||||||
|
const crlfB = Buffer.from(crlf, "utf-8");
|
||||||
|
const crlfcrlf = crlf + crlf;
|
||||||
|
const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8");
|
||||||
|
const warc10B = Buffer.from("WARC/1.0", "utf-8");
|
||||||
|
const emptyBuffer = Buffer.from("", "utf-8");
|
||||||
|
|
||||||
|
export async function* parseWARC(path: string) {
|
||||||
|
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
||||||
|
stderr: "ignore",
|
||||||
|
}).stdout;
|
||||||
|
|
||||||
|
// const warc = Bun.stdin.stream(1024 * 1024 * 128);
|
||||||
|
|
||||||
|
// let buffer: Uint8Array[] = [];
|
||||||
|
// const transform = new TransformStream<Uint8Array, Buffer>({
|
||||||
|
// transform(chunk, controller) {
|
||||||
|
// buffer.push(chunk);
|
||||||
|
// if (
|
||||||
|
// buffer.reduce((prev, curr) => prev + curr.length, 0) >
|
||||||
|
// 1024 * 1024 * 64
|
||||||
|
// ) {
|
||||||
|
// controller.enqueue(Buffer.concat(buffer));
|
||||||
|
// buffer = [];
|
||||||
|
// }
|
||||||
|
// },
|
||||||
|
// flush(controller) {
|
||||||
|
// controller.enqueue(Buffer.concat(buffer));
|
||||||
|
// },
|
||||||
|
// });
|
||||||
|
|
||||||
|
// warc.pipeTo(transform.writable);
|
||||||
|
|
||||||
|
const reader = warc.getReader();
|
||||||
|
// const reader = transform.readable.getReader();
|
||||||
|
|
||||||
|
// const warc = process.stdin;
|
||||||
|
|
||||||
|
let arrays: Buffer[] = [];
|
||||||
|
let done = false;
|
||||||
|
while (!done) {
|
||||||
|
const r = await reader.readMany();
|
||||||
|
if (r.done) {
|
||||||
|
done = true;
|
||||||
|
} else {
|
||||||
|
arrays = arrays.concat(r.value.map((x) => Buffer.from(x)));
|
||||||
|
if (
|
||||||
|
arrays.reduce((prev, curr) => prev + curr.length, 0) <
|
||||||
|
1024 * 1024 * 10
|
||||||
|
)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let buf: Buffer;
|
||||||
|
while (
|
||||||
|
((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
|
||||||
|
buf.subarray(warc10B.length).includes(warc10B))
|
||||||
|
) {
|
||||||
|
const until = buf.indexOf(crlfcrlfB);
|
||||||
|
const header = buf.subarray(0, until);
|
||||||
|
|
||||||
|
const lines = splitBuffer(header, crlfB);
|
||||||
|
let i = 0;
|
||||||
|
const nextLine = () => {
|
||||||
|
const line = lines[i];
|
||||||
|
i++;
|
||||||
|
return line ? line : emptyBuffer;
|
||||||
|
};
|
||||||
|
let line: Buffer;
|
||||||
|
if (!(line = nextLine()).equals(warc10B)) {
|
||||||
|
throw new Error(`No WARC 1.0 header in '${line}'`);
|
||||||
|
}
|
||||||
|
|
||||||
|
let field;
|
||||||
|
let fields = new Map<string, string>();
|
||||||
|
while (
|
||||||
|
((line = nextLine()),
|
||||||
|
(field = parseField(line.toString("utf8"))),
|
||||||
|
line.length !== 0)
|
||||||
|
) {
|
||||||
|
fields.set(field[0], field[1]);
|
||||||
|
}
|
||||||
|
const length = parseInt(fields.get("Content-Length")!);
|
||||||
|
|
||||||
|
const rawHttp = buf.subarray(
|
||||||
|
until + crlfcrlfB.length,
|
||||||
|
until + crlfcrlfB.length + length
|
||||||
|
);
|
||||||
|
const rawHttpHeaders = rawHttp
|
||||||
|
.subarray(
|
||||||
|
rawHttp.indexOf(crlfB) + crlfB.length,
|
||||||
|
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
|
||||||
|
)
|
||||||
|
.toString();
|
||||||
|
|
||||||
|
let httpHeaders = new Map<string, string>();
|
||||||
|
rawHttpHeaders.split(crlf).forEach((line) => {
|
||||||
|
if (!line.length) return;
|
||||||
|
const [key, val] = line.split(": ");
|
||||||
|
httpHeaders.set(key, val);
|
||||||
|
});
|
||||||
|
|
||||||
|
let content = rawHttp.subarray(
|
||||||
|
rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length
|
||||||
|
);
|
||||||
|
|
||||||
|
if (httpHeaders.get("Transfer-Encoding") === "chunked") {
|
||||||
|
content = dechunk(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
// console.debug(fields.get("WARC-Date"), content.length);
|
||||||
|
|
||||||
|
yield {
|
||||||
|
fields,
|
||||||
|
content,
|
||||||
|
};
|
||||||
|
|
||||||
|
arrays = [
|
||||||
|
buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
|
||||||
|
];
|
||||||
|
if (!arrays[0].length) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
|
||||||
|
let bufs = [];
|
||||||
|
let rest = buffer;
|
||||||
|
let i;
|
||||||
|
while (((i = rest.indexOf(val)), i !== -1)) {
|
||||||
|
bufs.push(rest.subarray(0, i));
|
||||||
|
rest = rest.subarray(i + val.length);
|
||||||
|
}
|
||||||
|
bufs.push(rest);
|
||||||
|
return bufs;
|
||||||
|
}
|
||||||
|
|
||||||
|
function parseField(line: string): [string, string] {
|
||||||
|
const [key, val] = line.split(": ");
|
||||||
|
return [key, val];
|
||||||
|
}
|
||||||
|
|
||||||
|
function dechunk(content: Buffer): Buffer {
|
||||||
|
let actualContent = [];
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
let until = content.indexOf(crlf);
|
||||||
|
const hexLen = content.subarray(0, until).toString();
|
||||||
|
if (hexLen.length === 0) break;
|
||||||
|
const len = parseInt(hexLen, 16);
|
||||||
|
actualContent.push(
|
||||||
|
content.subarray(until + crlfB.length, until + crlfB.length + len)
|
||||||
|
);
|
||||||
|
content = content.subarray(until + crlfB.length + len + crlfB.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Buffer.concat(actualContent);
|
||||||
|
}
|
31
sitio/Containerfile
Normal file
31
sitio/Containerfile
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
FROM docker.io/oven/bun:1-alpine as build
|
||||||
|
RUN apk add --no-cache nodejs
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
COPY . .
|
||||||
|
WORKDIR /usr/src/app/sitio
|
||||||
|
RUN bun install && \
|
||||||
|
bun run build
|
||||||
|
|
||||||
|
# FROM docker.io/oven/bun:1-alpine as deps
|
||||||
|
# WORKDIR /usr/src/app/sitio
|
||||||
|
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
|
||||||
|
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
||||||
|
|
||||||
|
FROM docker.io/alpine:3.19
|
||||||
|
RUN apk add --no-cache tini nodejs npm jq
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY --from=build /usr/src/app/sitio/package.json package.real.json
|
||||||
|
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
|
||||||
|
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
||||||
|
COPY --from=build /usr/src/app/sitio/build .
|
||||||
|
|
||||||
|
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
|
||||||
|
ENV PROTOCOL_HEADER=x-forwarded-proto
|
||||||
|
ENV HOST_HEADER=x-forwarded-host
|
||||||
|
|
||||||
|
VOLUME /db
|
||||||
|
ENV DB_PATH=/db/db.db
|
||||||
|
EXPOSE 3000
|
||||||
|
|
||||||
|
CMD ["tini", "node", "."]
|
|
@ -38,8 +38,7 @@
|
||||||
"better-sqlite3": "^9.2.2",
|
"better-sqlite3": "^9.2.2",
|
||||||
"chart.js": "^4.4.1",
|
"chart.js": "^4.4.1",
|
||||||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||||
"croner": "^8.0.0",
|
|
||||||
"dayjs": "^1.11.10",
|
"dayjs": "^1.11.10",
|
||||||
"drizzle-orm": "^0.29.1"
|
"drizzle-orm": "=0.29.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +0,0 @@
|
||||||
import { spawn } from "child_process";
|
|
||||||
import Cron from "croner";
|
|
||||||
|
|
||||||
if (process.env.NODE_ENV === "production") {
|
|
||||||
const job = Cron("15 3 * * *", () => {
|
|
||||||
runScraper();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
function runScraper() {
|
|
||||||
spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
|
|
||||||
}
|
|
|
@ -1,10 +1,8 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
export let product: { ean: string; name: string; imageUrl?: string | null };
|
export let product: { ean: string; name: string; imageUrl: string };
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<a href={`/ean/${product.ean}`} class="flex">
|
<a href={`/ean/${product.ean}`} class="flex">
|
||||||
{#if product.imageUrl}
|
|
||||||
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
|
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
|
||||||
{/if}
|
|
||||||
<p class="text-xl">{product.name}</p>
|
<p class="text-xl">{product.name}</p>
|
||||||
</a>
|
</a>
|
||||||
|
|
|
@ -1,22 +1,9 @@
|
||||||
import type { PageData, PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { db, schema } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
const { precios } = schema;
|
||||||
import { sql } from "drizzle-orm";
|
import { sql } from "drizzle-orm";
|
||||||
|
|
||||||
let cache: null | { key: Date; data: { precios: Precios } } = null;
|
export const load: PageServerLoad = async ({ params }) => {
|
||||||
|
|
||||||
type Precios = {
|
|
||||||
ean: string;
|
|
||||||
name: string | null;
|
|
||||||
imageUrl: string | null;
|
|
||||||
}[];
|
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({
|
|
||||||
params,
|
|
||||||
}): Promise<{ precios: Precios }> => {
|
|
||||||
if (cache && +new Date() < +cache.key + 1000 * 60 * 10) {
|
|
||||||
return cache.data;
|
|
||||||
}
|
|
||||||
const q = db
|
const q = db
|
||||||
.select({
|
.select({
|
||||||
ean: precios.ean,
|
ean: precios.ean,
|
||||||
|
@ -25,11 +12,9 @@ export const load: PageServerLoad = async ({
|
||||||
})
|
})
|
||||||
.from(precios)
|
.from(precios)
|
||||||
.groupBy(precios.ean)
|
.groupBy(precios.ean)
|
||||||
.having(sql`max(length(name)) and max(parser_version) and in_stock`)
|
.having(sql`max(length(name))`)
|
||||||
.orderBy(sql`random()`)
|
.orderBy(sql`random()`)
|
||||||
.limit(150);
|
.limit(150);
|
||||||
const res = await q;
|
const res = await q;
|
||||||
const data = { precios: res };
|
return { precios: res };
|
||||||
cache = { key: new Date(), data };
|
|
||||||
return data;
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -3,10 +3,6 @@
|
||||||
import type { PageData } from "./$types";
|
import type { PageData } from "./$types";
|
||||||
|
|
||||||
export let data: PageData;
|
export let data: PageData;
|
||||||
$: precios = data.precios.filter(
|
|
||||||
(d): d is { ean: string; name: string; imageUrl: string | null } =>
|
|
||||||
!!d.name,
|
|
||||||
);
|
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<h1 class="text-xl">WIP</h1>
|
<h1 class="text-xl">WIP</h1>
|
||||||
|
@ -36,7 +32,7 @@
|
||||||
<section>
|
<section>
|
||||||
<h2 class="text-lg font-bold">Random</h2>
|
<h2 class="text-lg font-bold">Random</h2>
|
||||||
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
||||||
{#each precios as product}
|
{#each data.precios as product}
|
||||||
<li>
|
<li>
|
||||||
<ProductPreview {product} />
|
<ProductPreview {product} />
|
||||||
</li>
|
</li>
|
||||||
|
|
|
@ -9,11 +9,13 @@ export const load: PageServerLoad = async ({ params }) => {
|
||||||
.select()
|
.select()
|
||||||
.from(precios)
|
.from(precios)
|
||||||
.where(eq(precios.ean, params.ean))
|
.where(eq(precios.ean, params.ean))
|
||||||
|
.groupBy(precios.warcRecordId)
|
||||||
|
.having(max(precios.parserVersion))
|
||||||
.orderBy(precios.fetchedAt);
|
.orderBy(precios.fetchedAt);
|
||||||
const res = await q;
|
const res = await q;
|
||||||
if (res.length === 0) return error(404, "Not Found");
|
if (res.length === 0) return error(404, "Not Found");
|
||||||
|
|
||||||
const meta = res.findLast((p) => p.name);
|
const meta = res.find((p) => p.name);
|
||||||
|
|
||||||
return { precios: res, meta };
|
return { precios: res, meta };
|
||||||
};
|
};
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
||||||
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
||||||
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
||||||
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
|
|
||||||
};
|
};
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|
|
@ -1,19 +1,18 @@
|
||||||
import { error } from "@sveltejs/kit";
|
import { error } from "@sveltejs/kit";
|
||||||
import { sql } from "drizzle-orm";
|
import { eq, max, sql } from "drizzle-orm";
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db } from "$lib/server/db";
|
import { db, schema } from "$lib/server/db";
|
||||||
|
const { precios } = schema;
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({ url }) => {
|
export const load: PageServerLoad = async ({ url }) => {
|
||||||
const query = url.searchParams.get("q");
|
const query = url.searchParams.get("q");
|
||||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||||
if (query) {
|
if (query) {
|
||||||
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
results = db.all(
|
||||||
|
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||||
join precios p on p.ean = f.ean
|
join precios p on p.ean = f.ean
|
||||||
where f.name match ${`"${query}"`}
|
where f.name match ${query};`,
|
||||||
group by p.ean
|
);
|
||||||
having max(p.fetched_at) and max(p.in_stock)
|
|
||||||
order by p.in_stock desc;`;
|
|
||||||
results = db.all(sqlQuery);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return { query, results };
|
return { query, results };
|
||||||
|
|
|
@ -15,6 +15,5 @@
|
||||||
"noEmit": true,
|
"noEmit": true,
|
||||||
"forceConsistentCasingInFileNames": true
|
"forceConsistentCasingInFileNames": true
|
||||||
},
|
},
|
||||||
"include": ["**/*.ts", "**/*.js"],
|
"include": ["**/*.ts", "**/*.js"]
|
||||||
"exclude": ["sitio/build"]
|
|
||||||
}
|
}
|
||||||
|
|
534
warcificator/Cargo.lock
generated
534
warcificator/Cargo.lock
generated
|
@ -24,6 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01"
|
checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
|
"getrandom",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"version_check",
|
"version_check",
|
||||||
"zerocopy",
|
"zerocopy",
|
||||||
|
@ -143,6 +144,12 @@ version = "3.14.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
|
checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "byteorder"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytes"
|
name = "bytes"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
|
@ -198,16 +205,6 @@ dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "crossbeam-channel"
|
|
||||||
version = "0.5.10"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"crossbeam-utils",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-utils"
|
name = "crossbeam-utils"
|
||||||
version = "0.8.18"
|
version = "0.8.18"
|
||||||
|
@ -218,14 +215,60 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deranged"
|
name = "cssparser"
|
||||||
version = "0.3.11"
|
version = "0.31.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"powerfmt",
|
"cssparser-macros",
|
||||||
|
"dtoa-short",
|
||||||
|
"itoa",
|
||||||
|
"phf 0.11.2",
|
||||||
|
"smallvec",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cssparser-macros"
|
||||||
|
version = "0.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||||
|
dependencies = [
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.43",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_more"
|
||||||
|
version = "0.99.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 1.0.109",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dtoa"
|
||||||
|
version = "1.0.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dtoa-short"
|
||||||
|
version = "0.3.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
|
||||||
|
dependencies = [
|
||||||
|
"dtoa",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ego-tree"
|
||||||
|
version = "0.6.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "encoding_rs"
|
name = "encoding_rs"
|
||||||
version = "0.8.33"
|
version = "0.8.33"
|
||||||
|
@ -299,6 +342,16 @@ dependencies = [
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "futf"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||||
|
dependencies = [
|
||||||
|
"mac",
|
||||||
|
"new_debug_unreachable",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures-channel"
|
name = "futures-channel"
|
||||||
version = "0.3.30"
|
version = "0.3.30"
|
||||||
|
@ -338,6 +391,24 @@ dependencies = [
|
||||||
"pin-utils",
|
"pin-utils",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fxhash"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "getopts"
|
||||||
|
version = "0.2.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
|
@ -399,6 +470,20 @@ version = "0.3.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html5ever"
|
||||||
|
version = "0.26.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"mac",
|
||||||
|
"markup5ever",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 1.0.109",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
|
@ -512,12 +597,6 @@ dependencies = [
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lazy_static"
|
|
||||||
version = "1.4.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.151"
|
version = "0.2.151"
|
||||||
|
@ -550,6 +629,26 @@ version = "0.4.20"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mac"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "markup5ever"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"phf 0.10.1",
|
||||||
|
"phf_codegen",
|
||||||
|
"string_cache",
|
||||||
|
"string_cache_codegen",
|
||||||
|
"tendril",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.7.1"
|
version = "2.7.1"
|
||||||
|
@ -583,14 +682,10 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nu-ansi-term"
|
name = "new_debug_unreachable"
|
||||||
version = "0.46.0"
|
version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
|
||||||
dependencies = [
|
|
||||||
"overload",
|
|
||||||
"winapi",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num_cpus"
|
name = "num_cpus"
|
||||||
|
@ -617,12 +712,6 @@ version = "1.19.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "overload"
|
|
||||||
version = "0.1.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking"
|
name = "parking"
|
||||||
version = "2.2.0"
|
version = "2.2.0"
|
||||||
|
@ -658,6 +747,86 @@ version = "2.3.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf"
|
||||||
|
version = "0.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf"
|
||||||
|
version = "0.11.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
|
||||||
|
dependencies = [
|
||||||
|
"phf_macros",
|
||||||
|
"phf_shared 0.11.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_codegen"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.10.0",
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_generator"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
"rand",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_generator"
|
||||||
|
version = "0.11.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.11.2",
|
||||||
|
"rand",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_macros"
|
||||||
|
version = "0.11.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.11.2",
|
||||||
|
"phf_shared 0.11.2",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.43",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_shared"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||||
|
dependencies = [
|
||||||
|
"siphasher",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_shared"
|
||||||
|
version = "0.11.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
|
||||||
|
dependencies = [
|
||||||
|
"siphasher",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pin-project-lite"
|
name = "pin-project-lite"
|
||||||
version = "0.2.13"
|
version = "0.2.13"
|
||||||
|
@ -677,10 +846,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
|
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "powerfmt"
|
name = "ppv-lite86"
|
||||||
version = "0.2.0"
|
version = "0.2.17"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "precomputed-hash"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
|
@ -700,6 +875,36 @@ dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand"
|
||||||
|
version = "0.8.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"rand_chacha",
|
||||||
|
"rand_core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_chacha"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||||
|
dependencies = [
|
||||||
|
"ppv-lite86",
|
||||||
|
"rand_core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_core"
|
||||||
|
version = "0.6.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redox_syscall"
|
name = "redox_syscall"
|
||||||
version = "0.4.1"
|
version = "0.4.1"
|
||||||
|
@ -828,6 +1033,22 @@ version = "1.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "scraper"
|
||||||
|
version = "0.18.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"cssparser",
|
||||||
|
"ego-tree",
|
||||||
|
"getopts",
|
||||||
|
"html5ever",
|
||||||
|
"once_cell",
|
||||||
|
"selectors",
|
||||||
|
"tendril",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sct"
|
name = "sct"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
|
@ -838,6 +1059,25 @@ dependencies = [
|
||||||
"untrusted",
|
"untrusted",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "selectors"
|
||||||
|
version = "0.25.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags 2.4.1",
|
||||||
|
"cssparser",
|
||||||
|
"derive_more",
|
||||||
|
"fxhash",
|
||||||
|
"log",
|
||||||
|
"new_debug_unreachable",
|
||||||
|
"phf 0.10.1",
|
||||||
|
"phf_codegen",
|
||||||
|
"precomputed-hash",
|
||||||
|
"servo_arc",
|
||||||
|
"smallvec",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.193"
|
version = "1.0.193"
|
||||||
|
@ -855,7 +1095,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn 2.0.43",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -882,12 +1122,12 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sharded-slab"
|
name = "servo_arc"
|
||||||
version = "0.1.7"
|
version = "0.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"lazy_static",
|
"stable_deref_trait",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -899,6 +1139,12 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "siphasher"
|
||||||
|
version = "0.3.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "slab"
|
name = "slab"
|
||||||
version = "0.4.9"
|
version = "0.4.9"
|
||||||
|
@ -930,6 +1176,49 @@ version = "0.9.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "stable_deref_trait"
|
||||||
|
version = "1.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "string_cache"
|
||||||
|
version = "0.8.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
|
||||||
|
dependencies = [
|
||||||
|
"new_debug_unreachable",
|
||||||
|
"once_cell",
|
||||||
|
"parking_lot",
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
"precomputed-hash",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "string_cache_codegen"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.10.0",
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "1.0.109"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "2.0.43"
|
version = "2.0.43"
|
||||||
|
@ -963,62 +1252,14 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "tendril"
|
||||||
version = "1.0.55"
|
version = "0.4.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2"
|
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"thiserror-impl",
|
"futf",
|
||||||
]
|
"mac",
|
||||||
|
"utf-8",
|
||||||
[[package]]
|
|
||||||
name = "thiserror-impl"
|
|
||||||
version = "1.0.55"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "thread_local"
|
|
||||||
version = "1.1.7"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"once_cell",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time"
|
|
||||||
version = "0.3.31"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
|
|
||||||
dependencies = [
|
|
||||||
"deranged",
|
|
||||||
"itoa",
|
|
||||||
"powerfmt",
|
|
||||||
"serde",
|
|
||||||
"time-core",
|
|
||||||
"time-macros",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time-core"
|
|
||||||
version = "0.1.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time-macros"
|
|
||||||
version = "0.2.16"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f"
|
|
||||||
dependencies = [
|
|
||||||
"time-core",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1036,11 +1277,6 @@ version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tl"
|
|
||||||
version = "0.7.7"
|
|
||||||
source = "git+https://github.com/evertedsphere/tl?branch=patch-1#56711166588fa6c7729a08e5740dca2526436316"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.35.1"
|
version = "1.35.1"
|
||||||
|
@ -1068,7 +1304,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn 2.0.43",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1107,35 +1343,10 @@ version = "0.1.40"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"log",
|
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tracing-attributes",
|
|
||||||
"tracing-core",
|
"tracing-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tracing-appender"
|
|
||||||
version = "0.2.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
|
|
||||||
dependencies = [
|
|
||||||
"crossbeam-channel",
|
|
||||||
"thiserror",
|
|
||||||
"time",
|
|
||||||
"tracing-subscriber",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tracing-attributes"
|
|
||||||
version = "0.1.27"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing-core"
|
name = "tracing-core"
|
||||||
version = "0.1.32"
|
version = "0.1.32"
|
||||||
|
@ -1143,32 +1354,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"valuable",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tracing-log"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
|
||||||
dependencies = [
|
|
||||||
"log",
|
|
||||||
"once_cell",
|
|
||||||
"tracing-core",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tracing-subscriber"
|
|
||||||
version = "0.3.18"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
|
||||||
dependencies = [
|
|
||||||
"nu-ansi-term",
|
|
||||||
"sharded-slab",
|
|
||||||
"smallvec",
|
|
||||||
"thread_local",
|
|
||||||
"tracing-core",
|
|
||||||
"tracing-log",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1198,6 +1383,12 @@ dependencies = [
|
||||||
"tinyvec",
|
"tinyvec",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-width"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "untrusted"
|
name = "untrusted"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
|
@ -1216,10 +1407,10 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "valuable"
|
name = "utf-8"
|
||||||
version = "0.1.0"
|
version = "0.7.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "vcpkg"
|
name = "vcpkg"
|
||||||
|
@ -1249,13 +1440,10 @@ dependencies = [
|
||||||
"async-channel",
|
"async-channel",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"rusqlite",
|
"rusqlite",
|
||||||
|
"scraper",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tl",
|
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
|
||||||
"tracing-appender",
|
|
||||||
"tracing-subscriber",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1285,7 +1473,7 @@ dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn 2.0.43",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1319,7 +1507,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn 2.0.43",
|
||||||
"wasm-bindgen-backend",
|
"wasm-bindgen-backend",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
@ -1346,28 +1534,6 @@ version = "0.25.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
|
checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "winapi"
|
|
||||||
version = "0.3.9"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
|
||||||
dependencies = [
|
|
||||||
"winapi-i686-pc-windows-gnu",
|
|
||||||
"winapi-x86_64-pc-windows-gnu",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "winapi-i686-pc-windows-gnu"
|
|
||||||
version = "0.4.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "winapi-x86_64-pc-windows-gnu"
|
|
||||||
version = "0.4.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.48.0"
|
version = "0.48.0"
|
||||||
|
@ -1461,5 +1627,5 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn",
|
"syn 2.0.43",
|
||||||
]
|
]
|
||||||
|
|
|
@ -7,18 +7,13 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
async-channel = "2.1.1"
|
async-channel = "2.1.1"
|
||||||
# lol_html = "1.2.0"
|
|
||||||
reqwest = { version = "0.11.23", default-features = false, features = [
|
reqwest = { version = "0.11.23", default-features = false, features = [
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
"gzip",
|
"gzip",
|
||||||
"brotli",
|
"brotli",
|
||||||
] }
|
] }
|
||||||
rusqlite = "0.30.0"
|
rusqlite = "0.30.0"
|
||||||
# scraper = "0.18.1"
|
scraper = "0.18.1"
|
||||||
serde = { version = "1.0.193", features = ["derive"] }
|
serde = { version = "1.0.193", features = ["derive"] }
|
||||||
serde_json = "1.0.109"
|
serde_json = "1.0.109"
|
||||||
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] }
|
|
||||||
tokio = { version = "1.35.1", features = ["full"] }
|
tokio = { version = "1.35.1", features = ["full"] }
|
||||||
tracing = { version = "0.1", features = ["log"] }
|
|
||||||
tracing-appender = "0.2.3"
|
|
||||||
tracing-subscriber = "0.3.18"
|
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
use async_channel::{Receiver, Sender};
|
use async_channel::{Receiver, Sender};
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
|
use scraper::{Element, Html, Selector};
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
env::args,
|
||||||
env::{self, args},
|
|
||||||
fs,
|
fs,
|
||||||
time::{SystemTime, UNIX_EPOCH},
|
time::{SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
use tl::VDom;
|
|
||||||
use tokio::io::{stderr, AsyncWriteExt};
|
use tokio::io::{stderr, AsyncWriteExt};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
@ -22,78 +21,6 @@ struct PrecioPoint {
|
||||||
image_url: Option<String>,
|
image_url: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// fn main() {
|
|
||||||
// let arg = args().skip(1).next().unwrap();
|
|
||||||
|
|
||||||
// let file_iter = fs::read_dir(arg)
|
|
||||||
// .unwrap()
|
|
||||||
// .filter(|pr| {
|
|
||||||
// if let Ok(p) = pr {
|
|
||||||
// !p.file_name().to_str().unwrap().ends_with(".link")
|
|
||||||
// } else {
|
|
||||||
// false
|
|
||||||
// }
|
|
||||||
// })
|
|
||||||
// .take(1000)
|
|
||||||
// .map(|f| fs::read(f.unwrap().path()).unwrap());
|
|
||||||
|
|
||||||
// let mut i = 0;
|
|
||||||
// for item in file_iter {
|
|
||||||
// i = i + 1;
|
|
||||||
// {
|
|
||||||
// // let mut text: Option<String> = None;
|
|
||||||
// // let mut price_str: Option<String> = None;
|
|
||||||
// // let mut rewriter = HtmlRewriter::new(
|
|
||||||
// // Settings {
|
|
||||||
// // element_content_handlers: vec![
|
|
||||||
// // // Rewrite insecure hyperlinks
|
|
||||||
// // element!("a[href]", |el| {
|
|
||||||
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
|
|
||||||
|
|
||||||
// // el.set_attribute("href", &href).unwrap();
|
|
||||||
|
|
||||||
// // Ok(())
|
|
||||||
// // }),
|
|
||||||
// // (
|
|
||||||
// // Cow::Owned("a".parse().unwrap()),
|
|
||||||
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
|
|
||||||
// // ),
|
|
||||||
// // element!(
|
|
||||||
// // "meta[property=\"product:price:amount\"]",
|
|
||||||
// // extract_first_attr(&mut price_str, "content")
|
|
||||||
// // ),
|
|
||||||
// // ],
|
|
||||||
// // memory_settings: lol_html::MemorySettings {
|
|
||||||
// // preallocated_parsing_buffer_size: 1024 * 16,
|
|
||||||
// // max_allowed_memory_usage: std::usize::MAX,
|
|
||||||
// // },
|
|
||||||
// // ..Settings::default()
|
|
||||||
// // },
|
|
||||||
// // |_: &[u8]| {},
|
|
||||||
// // );
|
|
||||||
|
|
||||||
// // rewriter.write(&item).unwrap();
|
|
||||||
// // rewriter.end().unwrap();
|
|
||||||
// // println!("{:#?}", price_str);
|
|
||||||
|
|
||||||
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
|
|
||||||
|
|
||||||
// let html = String::from_utf8(item).unwrap();
|
|
||||||
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
|
|
||||||
|
|
||||||
// match parse_carrefour("".into(), &dom) {
|
|
||||||
// Ok(point) => {
|
|
||||||
// // println!("{:?}", point);
|
|
||||||
// }
|
|
||||||
// Err(err) => {
|
|
||||||
// // println!("Error {:#?}: {}", err, html);
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// println!("n={}", i);
|
|
||||||
// }
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let mut args = args().skip(1);
|
let mut args = args().skip(1);
|
||||||
|
@ -111,10 +38,7 @@ async fn main() {
|
||||||
let (res_sender, res_receiver) = async_channel::unbounded::<PrecioPoint>();
|
let (res_sender, res_receiver) = async_channel::unbounded::<PrecioPoint>();
|
||||||
|
|
||||||
let mut handles = Vec::new();
|
let mut handles = Vec::new();
|
||||||
for _ in 1..env::var("N_COROUTINES")
|
for _ in 1..16 {
|
||||||
.map_or(Ok(32), |s| s.parse::<usize>())
|
|
||||||
.unwrap()
|
|
||||||
{
|
|
||||||
let rx = receiver.clone();
|
let rx = receiver.clone();
|
||||||
let tx = res_sender.clone();
|
let tx = res_sender.clone();
|
||||||
handles.push(tokio::spawn(worker(rx, tx)));
|
handles.push(tokio::spawn(worker(rx, tx)));
|
||||||
|
@ -146,7 +70,7 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
stderr()
|
stderr()
|
||||||
.write_all(format!("Failed to fetch {}: {:?}\n", url.as_str(), err).as_bytes())
|
.write_all(format!("Failed to fetch {}: {:#?}\n", url.as_str(), err).as_bytes())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
@ -157,7 +81,14 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum FetchError {
|
enum FetchError {
|
||||||
HttpError(reqwest::Error),
|
HttpError(reqwest::Error),
|
||||||
ParseError(&'static str),
|
NoPriceMetaEl,
|
||||||
|
NoMetaContent,
|
||||||
|
NotANumber,
|
||||||
|
NoStockMetaEl,
|
||||||
|
NoValidStockMeta,
|
||||||
|
NoSeedState,
|
||||||
|
NoProductInSeedState,
|
||||||
|
NoProductSkuInSeedState,
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
|
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
|
||||||
|
@ -171,122 +102,100 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
|
||||||
.await
|
.await
|
||||||
.map_err(|e| FetchError::HttpError(e))?;
|
.map_err(|e| FetchError::HttpError(e))?;
|
||||||
|
|
||||||
let dom = tl::parse(&body, tl::ParserOptions::default()).unwrap();
|
let html = Html::parse_document(&body);
|
||||||
// let parser = dom.parser();
|
|
||||||
|
|
||||||
let point = parse_carrefour(url, &dom)?;
|
let point = parse_carrefour(url, html)?;
|
||||||
|
|
||||||
Ok(point)
|
Ok(point)
|
||||||
}
|
}
|
||||||
|
fn parse_carrefour(url: String, html: Html) -> Result<PrecioPoint, FetchError> {
|
||||||
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchError> {
|
let meta_price_sel = Selector::parse("meta[property=\"product:price:amount\"]").unwrap();
|
||||||
let precio_centavos = {
|
let precio_centavos = match html.select(&meta_price_sel).next() {
|
||||||
get_meta_content(dom, "product:price:amount")?
|
Some(el) => match el.attr("content") {
|
||||||
.map(|s| {
|
Some(attr) => match attr.parse::<f64>() {
|
||||||
s.parse::<f64>()
|
Ok(f) => Ok((f * 100.0) as u64),
|
||||||
.map_err(|_| FetchError::ParseError("Failed to parse number"))
|
Err(_) => Err(FetchError::NotANumber),
|
||||||
})
|
},
|
||||||
.transpose()
|
None => Err(FetchError::NoMetaContent),
|
||||||
.map(|f| f.map(|f| (f * 100.0) as u64))
|
},
|
||||||
|
None => Err(FetchError::NoPriceMetaEl),
|
||||||
}?;
|
}?;
|
||||||
|
|
||||||
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
|
let meta_stock_el = Selector::parse("meta[property=\"product:availability\"]").unwrap();
|
||||||
let in_stock = match in_stock_meta {
|
let in_stock = match html.select(&meta_stock_el).next() {
|
||||||
Some(s) => match s.as_ref() {
|
Some(el) => match el.attr("content") {
|
||||||
"oos" => Some(false),
|
Some(attr) => match attr {
|
||||||
"instock" => Some(true),
|
"oos" => Ok(Some(false)),
|
||||||
_ => return Err(FetchError::ParseError("Not a valid product:availability")),
|
"instock" => Ok(Some(true)),
|
||||||
|
_ => Err(FetchError::NoValidStockMeta),
|
||||||
},
|
},
|
||||||
None => None,
|
None => Err(FetchError::NoMetaContent),
|
||||||
};
|
},
|
||||||
|
None => Err(FetchError::NoStockMetaEl),
|
||||||
|
}?;
|
||||||
|
|
||||||
let ean = {
|
let ean = {
|
||||||
let json = &parse_script_json(dom, "__STATE__")?;
|
let state = parse_script_json(&html, "__STATE__").ok_or(FetchError::NoSeedState)?;
|
||||||
let state = json
|
let seed_state = &state.as_object().ok_or(FetchError::NoSeedState)?;
|
||||||
.as_object()
|
let (_, product_json) = seed_state
|
||||||
.ok_or(FetchError::ParseError("Seed state not an object"))?;
|
|
||||||
let (_, product_json) = state
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.find(|(key, val)| {
|
.find(|(key, val)| {
|
||||||
key.starts_with("Product:")
|
key.starts_with("Product:")
|
||||||
&& val
|
&& val.as_object().map_or(false, |val| {
|
||||||
.as_object()
|
val.get("__typename")
|
||||||
.and_then(|val| val.get("__typename"))
|
|
||||||
.map_or(false, |typename| typename == "Product")
|
.map_or(false, |typename| typename == "Product")
|
||||||
})
|
})
|
||||||
.ok_or(FetchError::ParseError("No product in seed state"))?;
|
})
|
||||||
|
.ok_or(FetchError::NoProductInSeedState)?;
|
||||||
let cache_id = product_json
|
let cache_id = product_json
|
||||||
.get("cacheId")
|
.get("cacheId")
|
||||||
.and_then(|v| v.as_str())
|
.ok_or(FetchError::NoProductInSeedState)?;
|
||||||
.ok_or(FetchError::ParseError("No cacheId in seed state"))?;
|
let (_, product_sku_json) = seed_state
|
||||||
let (_, product_sku_json) = state
|
.into_iter()
|
||||||
.iter()
|
.filter_map(|(key, val)| val.as_object().map_or(None, |o| Some((key, o))))
|
||||||
.find(|(key, val)| {
|
.find(|(key, val)| {
|
||||||
key.starts_with(&format!("Product:{}", cache_id))
|
key.starts_with(&format!("Product:{}", cache_id))
|
||||||
&& val.as_object().map_or(false, |obj| {
|
&& val
|
||||||
obj.get("__typename")
|
.get("__typename")
|
||||||
.map_or(false, |typename| typename == "SKU")
|
.map_or(false, |typename| typename == "SKU")
|
||||||
})
|
})
|
||||||
})
|
.ok_or(FetchError::NoProductSkuInSeedState)?;
|
||||||
.ok_or(FetchError::ParseError("No Product:cacheId* found"))?;
|
|
||||||
product_sku_json
|
product_sku_json
|
||||||
.get("ean")
|
.get("ean")
|
||||||
.and_then(|v| v.as_str())
|
.ok_or(FetchError::NoProductSkuInSeedState)?
|
||||||
.ok_or(FetchError::ParseError("No product SKU in seed state"))?
|
.as_str()
|
||||||
|
.ok_or(FetchError::NoProductSkuInSeedState)?
|
||||||
.to_string()
|
.to_string()
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(PrecioPoint {
|
Ok(PrecioPoint {
|
||||||
ean,
|
ean: ean,
|
||||||
fetched_at: now_sec(),
|
fetched_at: now_sec(),
|
||||||
in_stock,
|
in_stock: in_stock,
|
||||||
name: None,
|
name: None,
|
||||||
image_url: None,
|
image_url: None,
|
||||||
parser_version: 5,
|
parser_version: 5,
|
||||||
precio_centavos,
|
precio_centavos: Some(precio_centavos),
|
||||||
url,
|
url: url,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a, str>>, FetchError> {
|
fn parse_script_json(html: &Html, varname: &str) -> Option<serde_json::Value> {
|
||||||
let tag = &dom
|
let template_sel = Selector::parse(&format!(
|
||||||
.query_selector(&format!("meta[property=\"{}\"]", prop))
|
|
||||||
.and_then(|mut iter| iter.next())
|
|
||||||
.and_then(|h| h.get(dom.parser()))
|
|
||||||
.and_then(|n| n.as_tag());
|
|
||||||
match tag {
|
|
||||||
Some(tag) => Ok(Some(
|
|
||||||
tag.attributes()
|
|
||||||
.get("content")
|
|
||||||
.flatten()
|
|
||||||
.ok_or(FetchError::ParseError("Failed to get content attr"))?
|
|
||||||
.as_utf8_str(),
|
|
||||||
)),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, FetchError> {
|
|
||||||
let parser = dom.parser();
|
|
||||||
let inner_html = &dom
|
|
||||||
.query_selector(&format!(
|
|
||||||
"template[data-type=\"json\"][data-varname=\"{}\"]",
|
"template[data-type=\"json\"][data-varname=\"{}\"]",
|
||||||
varname
|
varname
|
||||||
))
|
))
|
||||||
.and_then(|mut iter| iter.next())
|
.unwrap();
|
||||||
.and_then(|h| h.get(parser))
|
match html.select(&template_sel).next() {
|
||||||
.and_then(|n| n.as_tag())
|
Some(value) => match value.first_element_child() {
|
||||||
.and_then(|t| {
|
Some(script) => match serde_json::from_str(&script.inner_html()) {
|
||||||
t.children()
|
Ok(val) => val,
|
||||||
.all(parser)
|
Err(_) => None,
|
||||||
.iter()
|
},
|
||||||
.find(|n| n.as_tag().is_some())
|
None => None,
|
||||||
})
|
},
|
||||||
.ok_or(FetchError::ParseError("Failed to get script tag"))?
|
None => None,
|
||||||
.inner_html(parser);
|
}
|
||||||
Ok(inner_html
|
|
||||||
.parse()
|
|
||||||
.map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))?)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn now_sec() -> u64 {
|
fn now_sec() -> u64 {
|
||||||
|
@ -298,9 +207,9 @@ fn now_sec() -> u64 {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||||
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
||||||
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
||||||
while let Ok(res) = rx.recv().await {
|
while let Ok(res) = rx.recv().await {
|
||||||
println!("{:?}", res)
|
println!("{:#?}", res)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue