diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c20bd8d..e6e3ae3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,7 +1,7 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the // README at: https://github.com/devcontainers/templates/tree/main/src/alpine { - "name": "Alpine", + "name": "Debian", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:debian", "features": { diff --git a/.dockerignore b/.dockerignore index 86597ab..16aa0b4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,4 +4,11 @@ data/carrefour/ downloader/ node_modules/ */node_modules/ -*/Containerfile \ No newline at end of file +Containerfile +*/Containerfile +Dockerfile +*/Dockerfile +*.warc.zst +.git +scraper/debug/ +*/target/ \ No newline at end of file diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml new file mode 100644 index 0000000..2733099 --- /dev/null +++ b/.github/workflows/container.yml @@ -0,0 +1,54 @@ +name: check and publish container image + +on: + push: + branches: ["master"] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + check: + name: chequear typescript + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: oven-sh/setup-bun@v1 + + - run: bun install + working-directory: ./sitio + - run: bun check + working-directory: ./sitio + - run: bun install + working-directory: ./scraper + - run: bun check + working-directory: ./scraper + + build-and-push-sitio: + needs: check + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + 
password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio + - name: Build and push Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.gitignore b/.gitignore index 8aed687..d3a88c9 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,6 @@ scraper/x.tsv *.tmp target/ .env.* + +*/flamegraph.svg +*/perf.data* \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index e5b3b8a..c8c97a4 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,7 +13,7 @@ // https://github.com/vadimcn/codelldb/issues/884 "args": ["build", "--manifest-path=warcificator/Cargo.toml"] }, - "args": ["../data/samples/Carrefour.50.txt"], + "args": ["../data/carrefour"], "env": {} }, { diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..724ca0d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM docker.io/oven/bun:1-alpine AS base +WORKDIR /usr/src/app + +FROM base as build +ENV NODE_ENV=production +RUN apk add --no-cache nodejs +COPY . . +RUN bun install --frozen-lockfile +RUN cd sitio && \ + bun run build +RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js + +FROM cgr.dev/chainguard/wolfi-base +RUN apk add --no-cache nodejs npm jq bun + +# Sitio +COPY --from=build /usr/src/app/sitio/package.json package.real.json +RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install +COPY --from=build /usr/src/app/db-datos node_modules/db-datos +COPY --from=build /usr/src/app/sitio/build . 
+ +# Scraper +COPY --from=build /tmp/cli.build.js /bin/scraper +COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle + +ENV NODE_ENV=production +ENV DB_PATH=/db/db.db +EXPOSE 3000 + +CMD ["node", "."] \ No newline at end of file diff --git a/bun.lockb b/bun.lockb index 280e648..4ad5bd1 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/carrefour-link-scraper/package.json b/carrefour-link-scraper/package.json deleted file mode 100644 index a404536..0000000 --- a/carrefour-link-scraper/package.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "carrefour-link-scraper", - "type": "module", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": [], - "author": "", - "license": "ISC", - "dependencies": { - "linkedom": "^0.16.5", - "p-map": "^7.0.1" - } -} diff --git a/data/Jumbo.txt b/data/Jumbo.txt new file mode 100644 index 0000000..bfbf74f --- /dev/null +++ b/data/Jumbo.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363 +size 922185 diff --git a/data/samples/Jumbo.100.txt b/data/samples/Jumbo.100.txt new file mode 100644 index 0000000..fadf390 --- /dev/null +++ b/data/samples/Jumbo.100.txt @@ -0,0 +1,100 @@ +https://www.jumbo.com.ar/huevos-de-color-avicoper-6-u-1-paquete-2/p +https://www.jumbo.com.ar/ajo-ahumado-organico-pampa-gourmet-285g/p +https://www.jumbo.com.ar/boxer-dst-raya-finita-art-b278-talle-m/p +https://www.jumbo.com.ar/yogur-bebible-ser-sachet-vainilla-900g/p +https://www.jumbo.com.ar/plato-playo-melamina-27-cm-boho-krea-2/p +https://www.jumbo.com.ar/mermelada-la-vieja-fabrica-frutos-del-bosque-350-gr/p +https://www.jumbo.com.ar/dr-lemon-vodka-pomelo-5/p +https://www.jumbo.com.ar/vino-cuvelier-los-andes-grand-vin-750cc/p +https://www.jumbo.com.ar/capsulas-cafe-cabrales-dg-cortado-x88gr/p +https://www.jumbo.com.ar/pizza-muzarella-e/p 
+https://www.jumbo.com.ar/filet-de-merluza-rebozado-8/p +https://www.jumbo.com.ar/ron-bacardi-carta-blanca-750-ml/p +https://www.jumbo.com.ar/sal-gruesa-celusal-1-kg/p +https://www.jumbo.com.ar/vaso-bajo-acrilico-boho-krea-2/p +https://www.jumbo.com.ar/espumante-chandon-demi-sec/p +https://www.jumbo.com.ar/jarra-electrica-smartlife-sl-ek1714wpn/p +https://www.jumbo.com.ar/espumante-dada-7-rose-dulce-750-cc/p +https://www.jumbo.com.ar/panquequera-hudson-de-aluminio-con-antiadherente-22cm/p +https://www.jumbo.com.ar/sacapuntas-de-plastico-pizzini-2un/p +https://www.jumbo.com.ar/vino-vinas-de-alvear-tinto-750ml/p +https://www.jumbo.com.ar/campera-mujer-puffer-larga/p +https://www.jumbo.com.ar/tabla-de-quesos/p +https://www.jumbo.com.ar/frutos-del-bosque-frutas-del-sur-x400gr/p +https://www.jumbo.com.ar/blister-resaltador-flash-amarillo-x-1-un/p +https://www.jumbo.com.ar/alim-whiskas-gatitos-carne-y-leche-500gr/p +https://www.jumbo.com.ar/detergente-polvo-zorro-blue-3k-x-1un/p +https://www.jumbo.com.ar/media-vestir-hombre-1s10471-negro/p +https://www.jumbo.com.ar/nachos-macritas-ketchup-x90g/p +https://www.jumbo.com.ar/pack-x3-medias-juvenil-liso-t-5-elemento/p +https://www.jumbo.com.ar/set-de-vehiculos-emergencias-duravit/p +https://www.jumbo.com.ar/carbon-patagonia-x-4kgs/p +https://www.jumbo.com.ar/rejilla-mr-trapo-cocina-algodon/p +https://www.jumbo.com.ar/jugo-exprimido-pura-frutta-arandanos-manzana-verde-x-1l/p +https://www.jumbo.com.ar/media-dama-invisible-alta-nyb-urb-2/p +https://www.jumbo.com.ar/boxer-nino-raya-violeta-2-colores-dst-t-10/p +https://www.jumbo.com.ar/barra-zafran-caju-y-sem-de-zapallo-x112g/p +https://www.jumbo.com.ar/iniciador-de-fuego-maderasa/p +https://www.jumbo.com.ar/queso-mozzarella-barraza-x-500grs-paq-gr-500/p +https://www.jumbo.com.ar/vaso-de-vidrio-cuadrado-360-cc/p +https://www.jumbo.com.ar/shampoo-sedal-jengibre-y-ricino-190ml/p +https://www.jumbo.com.ar/roller-gel-filgo-gel-pop-glitter-1un/p 
+https://www.jumbo.com.ar/una-familia-anormal-el-misterio-de-prh/p +https://www.jumbo.com.ar/veggie-stick-tomate-y-oliva-via-vita-x-50grs/p +https://www.jumbo.com.ar/bowl-stor-bicolor-mickey-mouse/p +https://www.jumbo.com.ar/vino-blanco-don-valentin-lacrado-750-ml/p +https://www.jumbo.com.ar/un-vecino-anormal-2-prh/p +https://www.jumbo.com.ar/paleta-pet-cancat-mordillo-ice/p +https://www.jumbo.com.ar/aceitunas-nucete-premium-descarozadas-180-gr/p +https://www.jumbo.com.ar/caja-plastica-6l-teen-boy-pv23-krea-2/p +https://www.jumbo.com.ar/vino-santa-julia-chardonnay-x-750-cc/p +https://www.jumbo.com.ar/protecor-solar-dermaglos-bebes-fps65-120gr/p +https://www.jumbo.com.ar/oregano-100-gr/p +https://www.jumbo.com.ar/puerro-song/p +https://www.jumbo.com.ar/repuesto-difusor-sandia-pepino-350-ml-2/p +https://www.jumbo.com.ar/botellas-plasticas-origin-580ml-rosa-2/p +https://www.jumbo.com.ar/nescafe-dolca-original-x-170gr/p +https://www.jumbo.com.ar/tapa-empanada-veggie-signo-de-oro-x-500g/p +https://www.jumbo.com.ar/inflador-de-pie-bestway-air-hammer/p +https://www.jumbo.com.ar/ketchup-ahumado-marian-arytza-400g/p +https://www.jumbo.com.ar/sal-marina-finas-hierbas-ahumada-s-tacc-450g/p +https://www.jumbo.com.ar/jugo-smudis-pomelo-500ml-brk-0-5-lt/p +https://www.jumbo.com.ar/limpiador-antihongos-ayudin-removedor-activo-envase-economico-450-ml/p +https://www.jumbo.com.ar/marcador-permanente-punta-redonda-color-negro/p +https://www.jumbo.com.ar/galletitas-dulces-con-chips-de-chocolate-pepitos-119g/p +https://www.jumbo.com.ar/afeitadora-bic-comfort-twin-l5p4-2/p +https://www.jumbo.com.ar/canvas-20x20-cm-paisajes-04-krea/p +https://www.jumbo.com.ar/turron-georgalos-de-mani-con-chocolate-x-90-gr/p +https://www.jumbo.com.ar/arroz-vanguardia-elaborado-largo-fino/p +https://www.jumbo.com.ar/set-x-3-pastafrola-fija-n-14/p +https://www.jumbo.com.ar/pulpa-fina-basilico-mutti-400-gr/p +https://www.jumbo.com.ar/vino-tinto-elementos-malbec-750-cc/p 
+https://www.jumbo.com.ar/enjuague-bucal-listerine-antisarro-suave-sn-alcohol-x250/p +https://www.jumbo.com.ar/almohaditas-lasfor-avellana-200-grs/p +https://www.jumbo.com.ar/vino-tinto-los-haroldos-estate-cabernet-sauvignon-750-ml/p +https://www.jumbo.com.ar/peluche-funnyland-maxtoys-tibalt-perro-28cm/p +https://www.jumbo.com.ar/cafetera-filtro-negro-electrolux-1-2-litros/p +https://www.jumbo.com.ar/media-nina-ciudadella-minnie-t2/p +https://www.jumbo.com.ar/portaretrato-colores-13x18cm-4c-krea4136010100/p +https://www.jumbo.com.ar/lustramuebles-blem-madera-aceite-de-argan-aerosol-360cc/p +https://www.jumbo.com.ar/sriracha-sauce-hashi-x250ml-2/p +https://www.jumbo.com.ar/plato-hondo-22-1-cm-ceramica-blanca/p +https://www.jumbo.com.ar/limpiador-harpic-banos-sarro-y-manchas-495ml/p +https://www.jumbo.com.ar/shampoo-dove-real-poder-de-las-plantas-purificacion-jengibre-300-ml/p +https://www.jumbo.com.ar/aromatizador-glade-mini-gel-car-3/p +https://www.jumbo.com.ar/carpeta-con-10-folios-a4/p +https://www.jumbo.com.ar/sabana-king-caracol-krea/p +https://www.jumbo.com.ar/leche-en-polvo-nutribaby-1-hmo-x-800-grs/p +https://www.jumbo.com.ar/chalitas-viavita-clasicas-x-100-grs-sin-tacc/p +https://www.jumbo.com.ar/hervidor-tramontina-14cm-cm-x1/p +https://www.jumbo.com.ar/aceitunas-de-gordal-ybarra-x240gr-2/p +https://www.jumbo.com.ar/tableta-vizzio-relleno-nugaton-x100g-2/p +https://www.jumbo.com.ar/mortadela-paladini-fetas-finas-x-200-gr-2/p +https://www.jumbo.com.ar/budin-limon-y-amapolas/p +https://www.jumbo.com.ar/vino-chac-chac-sauvingnon-blanc-lata-269cc/p +https://www.jumbo.com.ar/whisky-chivas-regal-18-yo-700cc/p +https://www.jumbo.com.ar/copa-de-vidrio-rigolleau-6/p +https://www.jumbo.com.ar/notcreamcheese-210-gr/p +https://www.jumbo.com.ar/oso-con-miel-de-abejas-cuisine-co-340-gr/p +https://www.jumbo.com.ar/difusor-aromas-spirit-spirit-win-home-250ml-x1/p +https://www.jumbo.com.ar/exprimidor-ultracomb-ex-2302/p diff --git a/db-datos/package.json 
b/db-datos/package.json index e5e963c..4903e55 100644 --- a/db-datos/package.json +++ b/db-datos/package.json @@ -11,7 +11,7 @@ "author": "", "license": "ISC", "dependencies": { - "drizzle-orm": "=0.29.1" + "drizzle-orm": "^0.29.1" }, "devDependencies": { "@types/bun": "^1.0.0", diff --git a/db-datos/supermercado.ts b/db-datos/supermercado.ts index e46fe9a..1e167a0 100644 --- a/db-datos/supermercado.ts +++ b/db-datos/supermercado.ts @@ -2,15 +2,23 @@ export enum Supermercado { Dia = "Dia", Carrefour = "Carrefour", Coto = "Coto", + Jumbo = "Jumbo", } - +export const supermercados: Supermercado[] = [ + Supermercado.Carrefour, + Supermercado.Coto, + Supermercado.Dia, + Supermercado.Jumbo, +]; export const hosts: { [host: string]: Supermercado } = { "diaonline.supermercadosdia.com.ar": Supermercado.Dia, "www.carrefour.com.ar": Supermercado.Carrefour, "www.cotodigital3.com.ar": Supermercado.Coto, + "www.jumbo.com.ar": Supermercado.Jumbo, }; export const colorBySupermercado: { [supermercado in Supermercado]: string } = { [Supermercado.Dia]: "#d52b1e", [Supermercado.Carrefour]: "#19549d", [Supermercado.Coto]: "#e20025", + [Supermercado.Jumbo]: "#2dc850", }; diff --git a/dia-link-scraper/package.json b/dia-link-scraper/package.json deleted file mode 100644 index 57ff6fd..0000000 --- a/dia-link-scraper/package.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "dia-link-scraper", - "type": "module", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": [], - "author": "", - "license": "ISC", - "dependencies": { - "linkedom": "^0.16.5", - "p-map": "^7.0.0" - } -} diff --git a/carrefour-link-scraper/index.ts b/link-scrapers/carrefour.ts similarity index 76% rename from carrefour-link-scraper/index.ts rename to link-scrapers/carrefour.ts index 82c0d15..ce92306 100644 --- a/carrefour-link-scraper/index.ts +++ b/link-scrapers/carrefour.ts @@ -1,5 +1,6 @@ import pMap from "p-map"; 
import { saveUrls } from "db-datos/urlHelpers.js"; +import { getUrlsFromSitemap } from "./common.js"; export async function scrapCarrefourProducts() { await scrapBySitemap(); @@ -25,17 +26,7 @@ async function scrapBySitemap() { async (sitemapUrl) => { const res = await fetch(sitemapUrl); const xml = await res.text(); - let urls = new Set(); - new HTMLRewriter() - .on("loc", { - text(element) { - const txt = element.text.trim(); - if (!txt) return; - urls.add(txt); - }, - }) - .transform(new Response(xml)); - saveUrls(Array.from(urls)); + saveUrls(getUrlsFromSitemap(xml)); }, { concurrency: 3 } ); diff --git a/link-scrapers/common.ts b/link-scrapers/common.ts new file mode 100644 index 0000000..f4107a0 --- /dev/null +++ b/link-scrapers/common.ts @@ -0,0 +1,14 @@ +import { decodeXML } from "entities"; +export function getUrlsFromSitemap(xml: string) { + let urls = new Set(); + new HTMLRewriter() + .on("loc", { + text(element) { + const txt = element.text.trim(); + if (!txt) return; + urls.add(decodeXML(txt)); + }, + }) + .transform(new Response(xml)); + return Array.from(urls); +} diff --git a/coto-link-scraper/index.ts b/link-scrapers/coto.ts similarity index 89% rename from coto-link-scraper/index.ts rename to link-scrapers/coto.ts index b25be9b..d3de22d 100644 --- a/coto-link-scraper/index.ts +++ b/link-scrapers/coto.ts @@ -1,4 +1,3 @@ -import { getHtml } from "../scraper/fetch.js"; import { parseHTML } from "linkedom"; import PQueue from "p-queue"; import { saveUrls } from "db-datos/urlHelpers.js"; @@ -28,12 +27,13 @@ function getPage(url: string) { return async () => { let html; try { - html = await getHtml(url); + const res = await fetch(url); + html = await res.text(); } catch (error) { await getPage(url)(); return; } - const { document } = parseHTML(html.toString("utf-8")); + const { document } = parseHTML(html); const hrefs = Array.from( document.querySelectorAll(".product_info_container a"), diff --git a/dia-link-scraper/index.ts b/link-scrapers/dia.ts 
similarity index 91% rename from dia-link-scraper/index.ts rename to link-scrapers/dia.ts index 67709b0..5b469f6 100644 --- a/dia-link-scraper/index.ts +++ b/link-scrapers/dia.ts @@ -1,7 +1,7 @@ import pMap from "p-map"; import { parseHTML } from "linkedom"; -import { getHtml } from "../scraper/fetch.js"; import { saveUrls } from "db-datos/urlHelpers.js"; +import { getUrlsFromSitemap } from "./common.js"; const categorias = [ "https://diaonline.supermercadosdia.com.ar/almacen", @@ -81,21 +81,15 @@ async function scrapBySitemap() { "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml", ]; - await pMap(sitemaps, async (sitemapUrl) => { - const res = await fetch(sitemapUrl); - const xml = await res.text(); - let urls = new Set(); - new HTMLRewriter() - .on("loc", { - text(element) { - const txt = element.text.trim(); - if (!txt) return; - urls.add(txt); - }, - }) - .transform(new Response(xml)); - saveUrls(Array.from(urls)); - }); + await pMap( + sitemaps, + async (sitemapUrl) => { + const res = await fetch(sitemapUrl); + const xml = await res.text(); + saveUrls(getUrlsFromSitemap(xml)); + }, + { concurrency: 3 } + ); } async function scrapBySite() { @@ -110,8 +104,9 @@ async function scrapBySite() { await pMap( links, async (url) => { - const html = await getHtml(url); - const { document } = parseHTML(html.toString("utf-8")); + const res = await fetch(url); + const html = await res.text(); + const { document } = parseHTML(html); const hrefs = Array.from( document.querySelectorAll( diff --git a/link-scrapers/jumbo.ts b/link-scrapers/jumbo.ts new file mode 100644 index 0000000..77356d5 --- /dev/null +++ b/link-scrapers/jumbo.ts @@ -0,0 +1,38 @@ +import pMap from "p-map"; +import { saveUrls } from "db-datos/urlHelpers.js"; +import { getUrlsFromSitemap } from "./common.js"; + +export async function scrapJumboProducts() { + await scrapBySitemap(); +} + +async function scrapBySitemap() { + // de https://www.jumbo.com.ar/sitemap.xml + const sitemaps = [ + 
"https://www.jumbo.com.ar/sitemap/product-1.xml", + "https://www.jumbo.com.ar/sitemap/product-10.xml", + "https://www.jumbo.com.ar/sitemap/product-11.xml", + "https://www.jumbo.com.ar/sitemap/product-12.xml", + "https://www.jumbo.com.ar/sitemap/product-13.xml", + "https://www.jumbo.com.ar/sitemap/product-14.xml", + "https://www.jumbo.com.ar/sitemap/product-15.xml", + "https://www.jumbo.com.ar/sitemap/product-2.xml", + "https://www.jumbo.com.ar/sitemap/product-3.xml", + "https://www.jumbo.com.ar/sitemap/product-4.xml", + "https://www.jumbo.com.ar/sitemap/product-5.xml", + "https://www.jumbo.com.ar/sitemap/product-6.xml", + "https://www.jumbo.com.ar/sitemap/product-7.xml", + "https://www.jumbo.com.ar/sitemap/product-8.xml", + "https://www.jumbo.com.ar/sitemap/product-9.xml", + ]; + + await pMap( + sitemaps, + async (sitemapUrl) => { + const res = await fetch(sitemapUrl); + const xml = await res.text(); + saveUrls(getUrlsFromSitemap(xml)); + }, + { concurrency: 3 } + ); +} diff --git a/coto-link-scraper/package.json b/link-scrapers/package.json similarity index 84% rename from coto-link-scraper/package.json rename to link-scrapers/package.json index 04e7eac..9ae66f9 100644 --- a/coto-link-scraper/package.json +++ b/link-scrapers/package.json @@ -1,5 +1,5 @@ { - "name": "coto-link-scraper", + "name": "link-scrapers", "type": "module", "version": "1.0.0", "description": "", @@ -11,6 +11,7 @@ "author": "", "license": "ISC", "dependencies": { + "entities": "^4.5.0", "linkedom": "^0.16.5", "p-queue": "^8.0.1" } diff --git a/package.json b/package.json index 30324d4..6cc95ea 100644 --- a/package.json +++ b/package.json @@ -2,9 +2,7 @@ "name": "preciazo", "private": true, "workspaces": [ - "dia-link-scraper", - "coto-link-scraper", - "carrefour-link-scraper", + "link-scrapers", "scraper", "sitio", "db-datos" diff --git a/readme.md b/readme.md index 1775378..847b3fb 100644 --- a/readme.md +++ b/readme.md @@ -4,33 +4,23 @@ scrapeo "masivo" de precios y datos en supermercados 
argentinos ## componentes (en orden de proceso) -- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear +- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear (no hace falta correrlos porque ya hay listas armadas en [data/](./data/)) -- [warcificator](./warcificator/) descarga las paginas de productos y genera un archivo [WARC](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) con ellas -- el [scraper](./scraper/) procesa estos WARCs, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)) +- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)) - el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos ## setup hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js. -aparte, se necesita zstd, que se usa para comprimir los WARCs eficientemente. 
seguro está disponible en las repos de tu distro favorita :) - -empezá descargando un WARC con 50 páginas de sample, y recomprimilo con zstd: - -``` -wget --no-verbose --tries=3 --delete-after --input-file ./data/samples/Dia.txt --warc-file=dia-sample -gzip -dc dia-sample.warc.gz | zstd --long -15 --no-sparse -o dia-sample.warc.zst -``` - -después, scrapealo a una BD: +después, scrapeá un sample de productos de Carrefour a una BD: ``` cd scraper/ bun install -bun cli.ts scrap ../dia-sample.warc.zst +bun cli.ts scrap ../data/samples/Carrefour.50.txt ``` ahora miralo en el sitio: diff --git a/scraper/Containerfile b/scraper/Containerfile deleted file mode 100644 index 7df71d2..0000000 --- a/scraper/Containerfile +++ /dev/null @@ -1,42 +0,0 @@ -FROM docker.io/oven/bun:1-alpine AS base -WORKDIR /usr/src/app - -FROM base AS builder -ENV NODE_ENV=production -COPY . . -RUN bun install --frozen-lockfile \ - && bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \ - && rm -rf node_modules/ - -# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd -FROM docker.io/rust:1.74 AS warcificator-builder -WORKDIR /usr/src/ -RUN rustup target add x86_64-unknown-linux-musl -RUN apt-get update && apt-get install -y musl-tools musl-dev - -RUN USER=root cargo new warcificator -WORKDIR /usr/src/warcificator -COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./ -RUN cargo build --release - -COPY ./warcificator/src ./src -RUN cargo install --target x86_64-unknown-linux-musl --path .
- -FROM base -RUN apk add --no-cache wget zstd tini -RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \ - && chmod +x /etc/periodic/daily/scraper - -COPY --from=builder /tmp/cli.build.js /bin/scraper -COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/ -COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle -COPY --from=builder /usr/src/app/data /listas -WORKDIR /app - -VOLUME /db -ENV NODE_ENV=production -ENV DB_PATH=/db/db.db -ENV LISTS_DIR=/listas/ - -CMD ["tini", "/bin/busybox", "crond", "-f", "-l2"] -# CMD ["bun", "/bin/scraper"] \ No newline at end of file diff --git a/scraper/auto.ts b/scraper/auto.ts index 582cfd1..482e774 100644 --- a/scraper/auto.ts +++ b/scraper/auto.ts @@ -1,29 +1,20 @@ -import { mkdtemp, access, writeFile } from "node:fs/promises"; +import { mkdtemp, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; -import { join, resolve } from "node:path"; -import { spawn } from "node:child_process"; -import { Supermercado, hosts } from "db-datos/supermercado.js"; +import { join } from "node:path"; +import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js"; import PQueue from "p-queue"; -import { format, formatDuration, intervalToDuration } from "date-fns"; -import { parseWarc } from "./scrap.js"; -import { S3Client } from "@aws-sdk/client-s3"; -import { Upload } from "@aws-sdk/lib-storage"; -import { BunFile } from "bun"; +import { formatDuration, intervalToDuration } from "date-fns"; +import { downloadList } from "./scrap.js"; import { db } from "db-datos/db.js"; import { like } from "drizzle-orm"; import { productoUrls } from "db-datos/schema.js"; -import { scrapDiaProducts } from "../dia-link-scraper/index.js"; -import { scrapCotoProducts } from "../coto-link-scraper/index.js"; -import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; - -const supermercados: Supermercado[] = [ - Supermercado.Carrefour, - Supermercado.Coto, 
- Supermercado.Dia, -]; +import { scrapDiaProducts } from "../link-scrapers/dia.js"; +import { scrapCotoProducts } from "../link-scrapers/coto.js"; +import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; +import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU -const scrapQueue = new PQueue({ concurrency: 1 }); +const scrapQueue = new PQueue({ concurrency: 4 }); export async function auto() { const a = new Auto(); @@ -31,35 +22,9 @@ export async function auto() { } class Auto { - s3Config?: { s3: S3Client; bucketName: string }; telegramConfig?: { token: string; chatId: string }; constructor() { - if ( - !process.env.S3_ACCESS_KEY_ID || - !process.env.S3_SECRET_ACCESS_KEY || - !process.env.S3_BUCKET_NAME - ) { - if (process.env.NODE_ENV === "development") { - console.warn("faltan creds de s3, no voy a subir a s3"); - } else { - throw new Error("faltan creds de s3"); - } - } else { - this.s3Config = { - // https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2 - s3: new S3Client({ - endpoint: "https://s3.us-west-004.backblazeb2.com", - region: "us-west-004", - credentials: { - accessKeyId: process.env.S3_ACCESS_KEY_ID, - secretAccessKey: process.env.S3_SECRET_ACCESS_KEY, - }, - }), - bucketName: process.env.S3_BUCKET_NAME, - }; - } - if (!process.env.TELEGRAM_BOT_TOKEN) console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá"); else if (!process.env.TELEGRAM_BOT_CHAT_ID) @@ -89,6 +54,9 @@ class Auto { case "Carrefour": await scrapCarrefourProducts(); break; + case "Jumbo": + await scrapJumboProducts(); + break; } this.inform( `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}` @@ -107,93 +75,29 @@ class Auto { const urls = results.map((r) => r.url); await writeFile(listPath, urls.join("\n") + "\n"); - const date = new Date(); - const zstdWarcName = `${supermercado}-${format( 
- date, - "yyyy-MM-dd-HH:mm" - )}.warc.zst`; - const zstdWarcPath = join(ctxPath, zstdWarcName); - const subproc = Bun.spawn({ - cmd: ["warcificator", listPath, zstdWarcPath], - stderr: "ignore", - stdout: "ignore", - cwd: ctxPath, - }); - const t0 = performance.now(); - await subproc.exited; - this.inform( - `[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` - ); - - if (!(await fileExists(zstdWarcPath))) { - const err = this.report(`no encontré el ${zstdWarcPath}`); - throw err; - } - - this.scrapAndInform({ zstdWarcPath, zstdWarcName }); - - try { - await this.uploadToBucket({ - fileName: zstdWarcName, - file: Bun.file(zstdWarcPath), - }); - } catch (error) { - this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`); - console.error(error); - } - + this.scrapAndInform({ listPath }); // TODO: borrar archivos temporales } - async scrapAndInform({ - zstdWarcPath, - zstdWarcName, - }: { - zstdWarcPath: string; - zstdWarcName: string; - }) { + async scrapAndInform({ listPath }: { listPath: string }) { const res = await scrapQueue.add(async () => { const t0 = performance.now(); - const progress = await parseWarc(zstdWarcPath); + const progress = await downloadList(listPath); return { took: performance.now() - t0, progress }; }); if (res) { const { took, progress } = res; this.inform( - `Procesado ${zstdWarcName} (${progress.done} ok, ${ - progress.errors.length - } errores) (tardó ${formatMs(took)})` + `Procesado ${listPath} (${progress.done} ok, ${ + progress.skipped + } skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})` ); } else { - this.inform(`Algo falló en ${zstdWarcName}`); + this.inform(`Algo falló en ${listPath}`); } } - async uploadToBucket({ - fileName, - file, - }: { - fileName: string; - file: BunFile; - }) { - if (!this.s3Config) { - this.inform( - `[s3] Se intentó subir ${fileName} pero no tenemos creds de S3` - ); - return; - } - const parallelUploads3 = new Upload({ - client: this.s3Config.s3, - params: { - 
Bucket: this.s3Config.bucketName, - Key: fileName, - Body: file, - }, - }); - await parallelUploads3.done(); - } - inform(msg: string) { this.sendTelegramMsg(msg); console.info(msg); @@ -216,16 +120,6 @@ class Auto { } } -// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists -async function fileExists(path: string) { - try { - access(path); - return true; - } catch { - return false; - } -} - function formatMs(ms: number) { return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) })); } diff --git a/scraper/cli.ts b/scraper/cli.ts index 9507090..8223973 100644 --- a/scraper/cli.ts +++ b/scraper/cli.ts @@ -1,8 +1,9 @@ -import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; -import { scrapCotoProducts } from "../coto-link-scraper/index.js"; -import { scrapDiaProducts } from "../dia-link-scraper/index.js"; +import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; +import { scrapCotoProducts } from "../link-scrapers/coto.js"; +import { scrapDiaProducts } from "../link-scrapers/dia.js"; +import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; import { auto } from "./auto.js"; -import { parseWarc } from "./scrap.js"; +import { downloadList, getProduct } from "./scrap.js"; if (process.argv[2] === "auto") { await auto(); @@ -12,17 +13,24 @@ if (process.argv[2] === "auto") { await scrapDiaProducts(); } else if (process.argv[2] === "scrap-coto-links") { await scrapCotoProducts(); +} else if (process.argv[2] === "scrap-jumbo-links") { + await scrapJumboProducts(); +} else if (process.argv[2] === "scrap-link") { + const url = new URL(process.argv[3]); + const res = await fetch(url); + const text = await res.text(); + console.info(await getProduct(url, text)); } else if (process.argv[2] === "scrap") { - const warcPaths = process.argv.slice(3); - if (warcPaths.length > 0) { - for (const path of warcPaths) { - const res = await parseWarc(path); + const 
urlLists = process.argv.slice(3); + if (urlLists.length > 0) { + for (const path of urlLists) { + const res = await downloadList(path); console.info("======================================="); console.info(path, res); console.info("======================================="); } } else { - console.error("Especificá WARCs para scrapear."); + console.error("Especificá listas de urls para scrapear."); process.exit(1); } } else { diff --git a/scraper/fetch.ts b/scraper/fetch.ts deleted file mode 100644 index 59bffb2..0000000 --- a/scraper/fetch.ts +++ /dev/null @@ -1,13 +0,0 @@ -export async function getHtml(url: string) { - const res = await fetch(url); - return readableToBuffer(res.body!); -} - -async function readableToBuffer(source: AsyncIterable) { - // https://stackoverflow.com/a/72891118 - const buffers = []; - for await (const data of source) { - buffers.push(data); - } - return Buffer.concat(buffers); -} diff --git a/scraper/package.json b/scraper/package.json index 1ca6dd7..4399980 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -5,8 +5,7 @@ "description": "", "main": "index.js", "scripts": { - "build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..", - "push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper" + "check": "tsc" }, "keywords": [], "author": "", @@ -16,11 +15,11 @@ "@aws-sdk/lib-storage": "^3.478.0", "date-fns": "^3.0.6", "db-datos": "workspace:^", - "drizzle-orm": "=0.29.1", + "drizzle-orm": "^0.29.1", "linkedom": "^0.16.5", "nanoid": "^5.0.4", + "p-map": "^7.0.1", "p-queue": "^8.0.1", - "warcio": "^2.2.1", "zod": "^3.22.4" }, "devDependencies": { diff --git a/scraper/parsers/carrefour.ts b/scraper/parsers/carrefour.ts index e3f74fa..b025f62 100644 --- a/scraper/parsers/carrefour.ts +++ b/scraper/parsers/carrefour.ts @@ -1,6 +1,6 @@ import { parseHTML } from "linkedom"; import { Precioish } from "../scrap.js"; -import { getProductJsonLd, priceFromMeta, 
stockFromMeta } from "../common.js"; +import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js"; function parseScriptJson(dom: Window, varname: string): T { const script = dom.window.document.querySelector( diff --git a/scraper/common.ts b/scraper/parsers/common.ts similarity index 91% rename from scraper/common.ts rename to scraper/parsers/common.ts index 534800b..34804aa 100644 --- a/scraper/common.ts +++ b/scraper/parsers/common.ts @@ -21,7 +21,7 @@ function parseJsonLds(dom: Window): object[] { const scripts = dom.window.document.querySelectorAll( 'script[type="application/ld+json"]' ); - return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML)); + return Array.from(scripts, (script) => JSON.parse(script.innerHTML)); } function findJsonLd(dom: Window, type: string): object | undefined { return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type); @@ -31,8 +31,9 @@ const zProductLd = z.object({ "@type": z.literal("Product"), name: z.string(), image: z.string(), + sku: z.string().optional(), offers: z.object({ - offers: z.tuple([ + offers: z.array( z.object({ "@type": z.literal("Offer"), price: z.number(), @@ -41,8 +42,8 @@ const zProductLd = z.object({ "http://schema.org/OutOfStock", "http://schema.org/InStock", ]), - }), - ]), + }) + ), }), }); type ProductLd = z.infer; diff --git a/scraper/parsers/coto.ts b/scraper/parsers/coto.ts index 96cabdb..0e92642 100644 --- a/scraper/parsers/coto.ts +++ b/scraper/parsers/coto.ts @@ -19,7 +19,7 @@ function getEanFromText({ document }: Window) { } function getPriceFromText({ document }: Window) { const el = document.querySelector(".atg_store_newPrice"); - if (!el?.textContent) throw new Error("no encuentro el precio"); + if (!el?.textContent) return null; const nStr = el.textContent .trim() .replace("$", "") @@ -27,12 +27,16 @@ function getPriceFromText({ document }: Window) { .replace(",", "."); return parseFloat(nStr) * 100; } +function getInStock({ document }: Window) { + 
return !document.querySelector(".product_not_available"); +} export function getCotoProduct(html: string | Buffer): Precioish { const dom = parseHTML(html); const ean = getEanFromText(dom); const precioCentavos = getPriceFromText(dom); + const inStock = getInStock(dom); const name = dom.document .querySelector("h1.product_page") @@ -40,5 +44,5 @@ export function getCotoProduct(html: string | Buffer): Precioish { const imageUrl = dom.document.querySelector(".zoom img")?.src; - return { name, imageUrl, ean, precioCentavos }; + return { name, imageUrl, ean, precioCentavos, inStock }; } diff --git a/scraper/parsers/dia.ts b/scraper/parsers/dia.ts index be3e2c5..5fdd1ca 100644 --- a/scraper/parsers/dia.ts +++ b/scraper/parsers/dia.ts @@ -1,6 +1,6 @@ import { parseHTML } from "linkedom"; import { type Precioish } from "../scrap.js"; -import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js"; +import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js"; export function getDiaProduct(html: string | Buffer): Precioish { const dom = parseHTML(html); diff --git a/scraper/parsers/jumbo.ts b/scraper/parsers/jumbo.ts new file mode 100644 index 0000000..b26b09f --- /dev/null +++ b/scraper/parsers/jumbo.ts @@ -0,0 +1,54 @@ +import { parseHTML } from "linkedom"; +import { type Precioish } from "../scrap.js"; +import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js"; +import { z } from "zod"; + +const zJumboSearch = z.tuple([ + z.object({ + items: z.array( + z.object({ + ean: z.string(), + }) + ), + }), +]); + +async function getEanFromSearch(sku: string) { + const url = new URL( + "https://www.jumbo.com.ar/api/catalog_system/pub/products/search" + ); + url.searchParams.set("fq", `skuId:${sku}`); + const res = await fetch(url); + const json = await res.json(); + const parsed = zJumboSearch.parse(json); + const ean = parsed[0].items[0].ean; + if (!parsed[0].items.every((x) => x.ean === ean)) { + throw new Error("Inesperado: no 
todos los items tienen el mismo EAN"); + } + return ean; +} + +export async function getJumboProduct( + html: string | Buffer +): Promise { + const dom = parseHTML(html); + const precioCentavos = priceFromMeta(dom); + const inStock = stockFromMeta(dom); + + const ld = getProductJsonLd(dom); + const name = ld.name; + const imageUrl = ld.image; + + const retailerSku = ld.sku; + if (!retailerSku) + throw new Error("No encontré el SKU de Jumbo para pedir el EAN"); + const ean = await getEanFromSearch(retailerSku); + + return { + name, + imageUrl, + ean, + precioCentavos, + inStock, + }; +} diff --git a/scraper/scrap.ts b/scraper/scrap.ts index 0698ffe..d8acc55 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -1,112 +1,127 @@ +/// import * as schema from "db-datos/schema.js"; -import { WARCParser } from "warcio"; -import { writeFile } from "fs/promises"; +import { writeFile, mkdir } from "fs/promises"; import { createHash } from "crypto"; import { getCarrefourProduct } from "./parsers/carrefour.js"; import { getDiaProduct } from "./parsers/dia.js"; import { getCotoProduct } from "./parsers/coto.js"; import { join } from "path"; -import { and, eq, sql } from "drizzle-orm"; import { db } from "db-datos/db.js"; +import pMap from "p-map"; +import { getJumboProduct } from "./parsers/jumbo.js"; -const DEBUG = false; +const DEBUG = true; const PARSER_VERSION = 4; -const getPrevPrecio = db - .select({ id: schema.precios.id }) - .from(schema.precios) - .where( - and( - eq(schema.precios.warcRecordId, sql.placeholder("warcRecordId")), - eq(schema.precios.parserVersion, PARSER_VERSION) - ) - ) - .limit(1) - .prepare(); - export type Precio = typeof schema.precios.$inferInsert; export type Precioish = Omit< Precio, "fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion" >; -export async function parseWarc(path: string) { - // const warc = createReadStream(path); - let progress: { - done: number; - errors: { error: any; warcRecordId: string; path: string }[]; - } = { 
done: 0, errors: [] }; +export async function downloadList(path: string) { + let list = (await Bun.file(path).text()) + .split("\n") + .filter((s) => s.length > 0); - const proc = Bun.spawn(["zstdcat", "-d", path], {}); - const warc = proc.stdout; - // TODO: tirar error si falla zstd - - const parser = new WARCParser(warc); - for await (const record of parser) { - if (record.warcType === "response") { - if (!record.warcTargetURI) continue; - const warcRecordId = record.warcHeader("WARC-Record-ID"); - if (!warcRecordId) throw new Error("No tiene WARC-Record-ID"); - - if (getPrevPrecio.get({ warcRecordId })) { - console.debug(`skipped ${warcRecordId}`); - continue; - } - if (record.httpHeaders?.statusCode !== 200) { - console.debug( - `skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)` - ); - continue; - } - // TODO: sobreescribir si existe el mismo record-id pero con version mas bajo? - - const html = await record.contentText(); - - const url = new URL(record.warcTargetURI); - try { - let ish: Precioish | undefined = undefined; - if (url.hostname === "www.carrefour.com.ar") - ish = getCarrefourProduct(html); - else if (url.hostname === "diaonline.supermercadosdia.com.ar") - ish = getDiaProduct(html); - else if (url.hostname === "www.cotodigital3.com.ar") - ish = getCotoProduct(html); - else throw new Error(`Unknown host ${url.hostname}`); - - const p: Precio = { - ...ish, - fetchedAt: new Date(record.warcDate!), - url: record.warcTargetURI, - warcRecordId, - parserVersion: PARSER_VERSION, - }; - - await db.insert(schema.precios).values(p); - - progress.done++; - } catch (error) { - console.error({ path, warcRecordId, error }); - progress.errors.push({ - path, - warcRecordId, - error, - }); - - if (DEBUG) { - const urlHash = createHash("md5") - .update(record.warcTargetURI!) 
- .digest("hex"); - const output = join("debug", `${urlHash}.html`); - await writeFile(output, html); - console.error(`wrote html to ${output}`); + const results = await pMap( + list, + async (urlS) => { + let res: ScrapResult = { type: "skipped" }; + for (let attempts = 0; attempts < 6; attempts++) { + if (attempts !== 0) await wait(1500); + res = await scrap(urlS); + if (res.type === "done" || res.type === "skipped") { + break; } } + if (res.type === "error") console.error(res); + return res; + }, + { concurrency: 32 } + ); + + let progress: { + done: number; + skipped: number; + errors: { error: any; url: string; debugPath: string }[]; + } = { done: 0, skipped: 0, errors: [] }; + for (const result of results) { + switch (result.type) { + case "done": + progress.done++; + break; + case "error": + progress.errors.push(result); + break; + case "skipped": + progress.skipped++; + break; } } - - if ((await proc.exited) !== 0) { - throw new Error("zstd tiró un error"); - } - return progress; } + +export async function getProduct(url: URL, html: string): Promise { + if (url.hostname === "www.carrefour.com.ar") return getCarrefourProduct(html); + else if (url.hostname === "diaonline.supermercadosdia.com.ar") + return getDiaProduct(html); + else if (url.hostname === "www.cotodigital3.com.ar") + return getCotoProduct(html); + else if (url.hostname === "www.jumbo.com.ar") + return await getJumboProduct(html); + else throw new Error(`Unknown host ${url.hostname}`); +} + +type ScrapResult = + | { type: "skipped" } + | { type: "done" } + | { type: "error"; url: string; error: any; debugPath: string }; +async function scrap(urlS: string): Promise { + let url; + try { + url = new URL(urlS); + } catch (err) { + console.error(`skipped ${urlS} because ${err}`); + return { type: "skipped" }; + } + const res = await fetch(url); + if (!res.ok) { + console.debug(`skipped ${urlS} because status=${res.status} (!=200)`); + return { type: "skipped" }; + } + + const html = await res.text(); 
+ + try { + let ish = await getProduct(url, html); + + const p: Precio = { + ...ish, + fetchedAt: new Date(), + url: urlS, + parserVersion: PARSER_VERSION, + }; + + await db.insert(schema.precios).values(p); + + return { type: "done" }; + } catch (error) { + const urlHash = createHash("md5").update(urlS).digest("hex"); + const output = join("debug", `${urlHash}.html`); + if (DEBUG) { + await mkdir("debug", { recursive: true }); + await writeFile(output, html); + } + return { + type: "error", + url: urlS, + error, + debugPath: output, + }; + } +} + +function wait(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/scraper/tsconfig.json b/scraper/tsconfig.json index 3c43903..18a0a92 100644 --- a/scraper/tsconfig.json +++ b/scraper/tsconfig.json @@ -1,3 +1,4 @@ { - "extends": "../tsconfig.json" + "extends": "../tsconfig.json", + "exclude": ["../sitio"] } diff --git a/scraper/warc.ts b/scraper/warc.ts deleted file mode 100644 index 4e719c6..0000000 --- a/scraper/warc.ts +++ /dev/null @@ -1,157 +0,0 @@ -const crlf = "\r\n"; -const crlfB = Buffer.from(crlf, "utf-8"); -const crlfcrlf = crlf + crlf; -const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8"); -const warc10B = Buffer.from("WARC/1.0", "utf-8"); -const emptyBuffer = Buffer.from("", "utf-8"); - -export async function* parseWARC(path: string) { - const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], { - stderr: "ignore", - }).stdout; - - // const warc = Bun.stdin.stream(1024 * 1024 * 128); - - // let buffer: Uint8Array[] = []; - // const transform = new TransformStream({ - // transform(chunk, controller) { - // buffer.push(chunk); - // if ( - // buffer.reduce((prev, curr) => prev + curr.length, 0) > - // 1024 * 1024 * 64 - // ) { - // controller.enqueue(Buffer.concat(buffer)); - // buffer = []; - // } - // }, - // flush(controller) { - // controller.enqueue(Buffer.concat(buffer)); - // }, - // }); - - // warc.pipeTo(transform.writable); - - const reader = warc.getReader(); - 
// const reader = transform.readable.getReader(); - - // const warc = process.stdin; - - let arrays: Buffer[] = []; - let done = false; - while (!done) { - const r = await reader.readMany(); - if (r.done) { - done = true; - } else { - arrays = arrays.concat(r.value.map((x) => Buffer.from(x))); - if ( - arrays.reduce((prev, curr) => prev + curr.length, 0) < - 1024 * 1024 * 10 - ) - continue; - } - let buf: Buffer; - while ( - ((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)), - buf.subarray(warc10B.length).includes(warc10B)) - ) { - const until = buf.indexOf(crlfcrlfB); - const header = buf.subarray(0, until); - - const lines = splitBuffer(header, crlfB); - let i = 0; - const nextLine = () => { - const line = lines[i]; - i++; - return line ? line : emptyBuffer; - }; - let line: Buffer; - if (!(line = nextLine()).equals(warc10B)) { - throw new Error(`No WARC 1.0 header in '${line}'`); - } - - let field; - let fields = new Map(); - while ( - ((line = nextLine()), - (field = parseField(line.toString("utf8"))), - line.length !== 0) - ) { - fields.set(field[0], field[1]); - } - const length = parseInt(fields.get("Content-Length")!); - - const rawHttp = buf.subarray( - until + crlfcrlfB.length, - until + crlfcrlfB.length + length - ); - const rawHttpHeaders = rawHttp - .subarray( - rawHttp.indexOf(crlfB) + crlfB.length, - rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length - ) - .toString(); - - let httpHeaders = new Map(); - rawHttpHeaders.split(crlf).forEach((line) => { - if (!line.length) return; - const [key, val] = line.split(": "); - httpHeaders.set(key, val); - }); - - let content = rawHttp.subarray( - rawHttp.indexOf(crlfcrlfB) + crlfcrlfB.length - ); - - if (httpHeaders.get("Transfer-Encoding") === "chunked") { - content = dechunk(content); - } - - // console.debug(fields.get("WARC-Date"), content.length); - - yield { - fields, - content, - }; - - arrays = [ - buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length), - ]; - if 
(!arrays[0].length) break; - } - } -} - -function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] { - let bufs = []; - let rest = buffer; - let i; - while (((i = rest.indexOf(val)), i !== -1)) { - bufs.push(rest.subarray(0, i)); - rest = rest.subarray(i + val.length); - } - bufs.push(rest); - return bufs; -} - -function parseField(line: string): [string, string] { - const [key, val] = line.split(": "); - return [key, val]; -} - -function dechunk(content: Buffer): Buffer { - let actualContent = []; - - while (true) { - let until = content.indexOf(crlf); - const hexLen = content.subarray(0, until).toString(); - if (hexLen.length === 0) break; - const len = parseInt(hexLen, 16); - actualContent.push( - content.subarray(until + crlfB.length, until + crlfB.length + len) - ); - content = content.subarray(until + crlfB.length + len + crlfB.length); - } - - return Buffer.concat(actualContent); -} diff --git a/sitio/Containerfile b/sitio/Containerfile deleted file mode 100644 index 687b071..0000000 --- a/sitio/Containerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM docker.io/oven/bun:1-alpine as build -RUN apk add --no-cache nodejs -WORKDIR /usr/src/app -COPY . . -WORKDIR /usr/src/app/sitio -RUN bun install && \ - bun run build - -# FROM docker.io/oven/bun:1-alpine as deps -# WORKDIR /usr/src/app/sitio -# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1" -# COPY --from=build /usr/src/app/db-datos node_modules/db-datos - -FROM docker.io/alpine:3.19 -RUN apk add --no-cache tini nodejs npm jq - -WORKDIR /app -COPY --from=build /usr/src/app/sitio/package.json package.real.json -RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install -COPY --from=build /usr/src/app/db-datos node_modules/db-datos -COPY --from=build /usr/src/app/sitio/build . 
- -# https://github.com/gornostay25/svelte-adapter-bun/issues/39 -ENV PROTOCOL_HEADER=x-forwarded-proto -ENV HOST_HEADER=x-forwarded-host - -VOLUME /db -ENV DB_PATH=/db/db.db -EXPOSE 3000 - -CMD ["tini", "node", "."] \ No newline at end of file diff --git a/sitio/package.json b/sitio/package.json index 184c06a..ee66782 100644 --- a/sitio/package.json +++ b/sitio/package.json @@ -38,7 +38,8 @@ "better-sqlite3": "^9.2.2", "chart.js": "^4.4.1", "chartjs-adapter-dayjs-4": "^1.0.4", + "croner": "^8.0.0", "dayjs": "^1.11.10", - "drizzle-orm": "=0.29.1" + "drizzle-orm": "^0.29.1" } } diff --git a/sitio/src/hooks.server.ts b/sitio/src/hooks.server.ts new file mode 100644 index 0000000..a7fc486 --- /dev/null +++ b/sitio/src/hooks.server.ts @@ -0,0 +1,12 @@ +import { spawn } from "child_process"; +import Cron from "croner"; + +if (process.env.NODE_ENV === "production") { + const job = Cron("15 3 * * *", () => { + runScraper(); + }); +} + +function runScraper() { + spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" }); +} diff --git a/sitio/src/lib/ProductPreview.svelte b/sitio/src/lib/ProductPreview.svelte index cf377b9..17fa6cc 100644 --- a/sitio/src/lib/ProductPreview.svelte +++ b/sitio/src/lib/ProductPreview.svelte @@ -1,8 +1,10 @@ - {product.name} + {#if product.imageUrl} + {product.name} + {/if}

{product.name}

diff --git a/sitio/src/routes/+page.server.ts b/sitio/src/routes/+page.server.ts index 91b9e82..fb92f70 100644 --- a/sitio/src/routes/+page.server.ts +++ b/sitio/src/routes/+page.server.ts @@ -1,9 +1,22 @@ -import type { PageServerLoad } from "./$types"; +import type { PageData, PageServerLoad } from "./$types"; import { db, schema } from "$lib/server/db"; const { precios } = schema; import { sql } from "drizzle-orm"; -export const load: PageServerLoad = async ({ params }) => { +let cache: null | { key: Date; data: { precios: Precios } } = null; + +type Precios = { + ean: string; + name: string | null; + imageUrl: string | null; +}[]; + +export const load: PageServerLoad = async ({ + params, +}): Promise<{ precios: Precios }> => { + if (cache && +new Date() < +cache.key + 1000 * 60 * 10) { + return cache.data; + } const q = db .select({ ean: precios.ean, @@ -12,9 +25,11 @@ export const load: PageServerLoad = async ({ params }) => { }) .from(precios) .groupBy(precios.ean) - .having(sql`max(length(name))`) + .having(sql`max(length(name)) and max(parser_version) and in_stock`) .orderBy(sql`random()`) .limit(150); const res = await q; - return { precios: res }; + const data = { precios: res }; + cache = { key: new Date(), data }; + return data; }; diff --git a/sitio/src/routes/+page.svelte b/sitio/src/routes/+page.svelte index 349bd3b..e16cb94 100644 --- a/sitio/src/routes/+page.svelte +++ b/sitio/src/routes/+page.svelte @@ -3,6 +3,10 @@ import type { PageData } from "./$types"; export let data: PageData; + $: precios = data.precios.filter( + (d): d is { ean: string; name: string; imageUrl: string | null } => + !!d.name, + );

WIP

@@ -32,7 +36,7 @@

Random

    - {#each data.precios as product} + {#each precios as product}
  • diff --git a/sitio/src/routes/ean/[ean]/+page.server.ts b/sitio/src/routes/ean/[ean]/+page.server.ts index 32d0688..9f9f9e3 100644 --- a/sitio/src/routes/ean/[ean]/+page.server.ts +++ b/sitio/src/routes/ean/[ean]/+page.server.ts @@ -9,13 +9,11 @@ export const load: PageServerLoad = async ({ params }) => { .select() .from(precios) .where(eq(precios.ean, params.ean)) - .groupBy(precios.warcRecordId) - .having(max(precios.parserVersion)) .orderBy(precios.fetchedAt); const res = await q; if (res.length === 0) return error(404, "Not Found"); - const meta = res.find((p) => p.name); + const meta = res.findLast((p) => p.name); return { precios: res, meta }; }; diff --git a/sitio/src/routes/ean/[ean]/+page.svelte b/sitio/src/routes/ean/[ean]/+page.svelte index a05b458..d3ce771 100644 --- a/sitio/src/routes/ean/[ean]/+page.svelte +++ b/sitio/src/routes/ean/[ean]/+page.svelte @@ -17,6 +17,7 @@ [Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]", [Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]", [Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]", + [Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]", }; diff --git a/sitio/src/routes/search/+page.server.ts b/sitio/src/routes/search/+page.server.ts index 65f2783..39c15d3 100644 --- a/sitio/src/routes/search/+page.server.ts +++ b/sitio/src/routes/search/+page.server.ts @@ -1,18 +1,19 @@ import { error } from "@sveltejs/kit"; -import { eq, max, sql } from "drizzle-orm"; +import { sql } from "drizzle-orm"; import type { PageServerLoad } from "./$types"; -import { db, schema } from "$lib/server/db"; -const { precios } = schema; +import { db } from "$lib/server/db"; export const load: PageServerLoad = async ({ url }) => { const query = url.searchParams.get("q"); let results: null | { ean: string; name: string; imageUrl: string }[] = null; if (query) { - results = db.all( - sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f + const sqlQuery = sql`select p.ean, p.name, p.image_url as 
imageUrl from precios_fts f join precios p on p.ean = f.ean - where f.name match ${query};`, - ); + where f.name match ${`"${query}"`} + group by p.ean + having max(p.fetched_at) and max(p.in_stock) + order by p.in_stock desc;`; + results = db.all(sqlQuery); } return { query, results }; diff --git a/tsconfig.json b/tsconfig.json index d975f54..3a51a30 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -15,5 +15,6 @@ "noEmit": true, "forceConsistentCasingInFileNames": true }, - "include": ["**/*.ts", "**/*.js"] + "include": ["**/*.ts", "**/*.js"], + "exclude": ["sitio/build"] } diff --git a/warcificator/Cargo.lock b/warcificator/Cargo.lock index 3b15e8c..bac9aa5 100644 --- a/warcificator/Cargo.lock +++ b/warcificator/Cargo.lock @@ -24,7 +24,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", - "getrandom", "once_cell", "version_check", "zerocopy", @@ -144,12 +143,6 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "bytes" version = "1.5.0" @@ -205,6 +198,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.18" @@ -215,60 +218,14 @@ dependencies = [ ] [[package]] -name = "cssparser" -version = "0.31.2" +name = "deranged" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" dependencies = [ - "cssparser-macros", - "dtoa-short", - "itoa", - "phf 0.11.2", - "smallvec", + "powerfmt", ] -[[package]] -name = "cssparser-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" -dependencies = [ - "quote", - "syn 2.0.43", -] - -[[package]] -name = "derive_more" -version = "0.99.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "dtoa" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" - -[[package]] -name = "dtoa-short" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74" -dependencies = [ - "dtoa", -] - -[[package]] -name = "ego-tree" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" - [[package]] name = "encoding_rs" version = "0.8.33" @@ -342,16 +299,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "futures-channel" version = "0.3.30" @@ -391,24 +338,6 @@ dependencies = [ "pin-utils", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "getopts" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.2.11" @@ -470,20 +399,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" -[[package]] -name = "html5ever" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "http" version = "0.2.11" @@ -597,6 +512,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.151" @@ -629,26 +550,6 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - -[[package]] -name = "markup5ever" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" -dependencies = [ - "log", - "phf 0.10.1", - "phf_codegen", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "memchr" version = "2.7.1" @@ -682,10 +583,14 @@ 
dependencies = [ ] [[package]] -name = "new_debug_unreachable" -version = "1.0.4" +name = "nu-ansi-term" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] [[package]] name = "num_cpus" @@ -712,6 +617,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "parking" version = "2.2.0" @@ -747,86 +658,6 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_shared 0.10.0", -] - -[[package]] -name = "phf" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" -dependencies = [ - "phf_macros", - "phf_shared 0.11.2", -] - -[[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - -[[package]] -name = "phf_generator" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" -dependencies = [ - "phf_shared 0.10.0", - "rand", -] - -[[package]] -name = "phf_generator" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" -dependencies = [ - "phf_shared 0.11.2", - "rand", -] - -[[package]] -name = "phf_macros" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" -dependencies = [ - "phf_generator 0.11.2", - "phf_shared 0.11.2", - "proc-macro2", - "quote", - "syn 2.0.43", -] - -[[package]] -name = "phf_shared" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" -dependencies = [ - "siphasher", -] - -[[package]] -name = "phf_shared" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" -dependencies = [ - "siphasher", -] - [[package]] name = "pin-project-lite" version = "0.2.13" @@ -846,16 +677,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" [[package]] -name = "ppv-lite86" -version = "0.2.17" +name = "powerfmt" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "proc-macro2" @@ -875,36 +700,6 @@ dependencies = [ "proc-macro2", ] 
-[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -1033,22 +828,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.18.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1" -dependencies = [ - "ahash", - "cssparser", - "ego-tree", - "getopts", - "html5ever", - "once_cell", - "selectors", - "tendril", -] - [[package]] name = "sct" version = "0.7.1" @@ -1059,25 +838,6 @@ dependencies = [ "untrusted", ] -[[package]] -name = "selectors" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" -dependencies = [ - "bitflags 2.4.1", - "cssparser", - "derive_more", - "fxhash", - "log", - "new_debug_unreachable", - "phf 0.10.1", - "phf_codegen", - "precomputed-hash", - "servo_arc", - "smallvec", -] - [[package]] name = "serde" version = "1.0.193" @@ -1095,7 +855,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", 
- "syn 2.0.43", + "syn", ] [[package]] @@ -1122,12 +882,12 @@ dependencies = [ ] [[package]] -name = "servo_arc" -version = "0.3.0" +name = "sharded-slab" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ - "stable_deref_trait", + "lazy_static", ] [[package]] @@ -1139,12 +899,6 @@ dependencies = [ "libc", ] -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - [[package]] name = "slab" version = "0.4.9" @@ -1176,49 +930,6 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "string_cache" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" -dependencies = [ - "new_debug_unreachable", - "once_cell", - "parking_lot", - "phf_shared 0.10.0", - "precomputed-hash", - "serde", -] - -[[package]] -name = "string_cache_codegen" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro2", - "quote", -] - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 
-dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.43" @@ -1252,14 +963,62 @@ dependencies = [ ] [[package]] -name = "tendril" -version = "0.4.3" +name = "thiserror" +version = "1.0.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2" dependencies = [ - "futf", - "mac", - "utf-8", + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "time" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" +dependencies = [ + "deranged", + "itoa", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +dependencies = [ + "time-core", ] [[package]] @@ -1277,6 +1036,11 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name 
= "tl" +version = "0.7.7" +source = "git+https://github.com/evertedsphere/tl?branch=patch-1#56711166588fa6c7729a08e5740dca2526436316" + [[package]] name = "tokio" version = "1.35.1" @@ -1304,7 +1068,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn", ] [[package]] @@ -1343,10 +1107,35 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ + "log", "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-appender" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf" +dependencies = [ + "crossbeam-channel", + "thiserror", + "time", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tracing-core" version = "0.1.32" @@ -1354,6 +1143,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "nu-ansi-term", + "sharded-slab", + 
"smallvec", + "thread_local", + "tracing-core", + "tracing-log", ] [[package]] @@ -1383,12 +1198,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-width" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" - [[package]] name = "untrusted" version = "0.9.0" @@ -1407,10 +1216,10 @@ dependencies = [ ] [[package]] -name = "utf-8" -version = "0.7.6" +name = "valuable" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] name = "vcpkg" @@ -1440,10 +1249,13 @@ dependencies = [ "async-channel", "reqwest", "rusqlite", - "scraper", "serde", "serde_json", + "tl", "tokio", + "tracing", + "tracing-appender", + "tracing-subscriber", ] [[package]] @@ -1473,7 +1285,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.43", + "syn", "wasm-bindgen-shared", ] @@ -1507,7 +1319,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1534,6 +1346,28 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + 
+[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-sys" version = "0.48.0" @@ -1627,5 +1461,5 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn", ] diff --git a/warcificator/Cargo.toml b/warcificator/Cargo.toml index 99a875f..a9f7076 100644 --- a/warcificator/Cargo.toml +++ b/warcificator/Cargo.toml @@ -7,13 +7,18 @@ edition = "2021" [dependencies] async-channel = "2.1.1" +# lol_html = "1.2.0" reqwest = { version = "0.11.23", default-features = false, features = [ "rustls-tls", "gzip", "brotli", ] } rusqlite = "0.30.0" -scraper = "0.18.1" +# scraper = "0.18.1" serde = { version = "1.0.193", features = ["derive"] } serde_json = "1.0.109" +tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] } tokio = { version = "1.35.1", features = ["full"] } +tracing = { version = "0.1", features = ["log"] } +tracing-appender = "0.2.3" +tracing-subscriber = "0.3.18" diff --git a/warcificator/src/main.rs b/warcificator/src/main.rs index 1b2c9ac..5311b41 100644 --- a/warcificator/src/main.rs +++ b/warcificator/src/main.rs @@ -1,9 +1,18 @@ use async_channel::{Receiver, Sender}; +// use lol_html::{ +// element, +// html_content::{Element, TextChunk}, +// text, ElementContentHandlers, HtmlRewriter, Selector, Settings, +// }; use rusqlite::Connection; -use scraper::{Element, Html, Selector}; +use serde::de::value; +use tl::VDom; +// use scraper::{Element, Html, Selector}; use std::{ + borrow::Cow, env::args, fs, + ops::Deref, time::{SystemTime, UNIX_EPOCH}, }; use tokio::io::{stderr, AsyncWriteExt}; @@ -21,6 +30,109 @@ struct PrecioPoint { image_url: Option, } +// fn main() { +// let arg = args().skip(1).next().unwrap(); + +// let file_iter = fs::read_dir(arg) +// 
.unwrap() +// .filter(|pr| { +// if let Ok(p) = pr { +// !p.file_name().to_str().unwrap().ends_with(".link") +// } else { +// false +// } +// }) +// .take(1000) +// .map(|f| fs::read(f.unwrap().path()).unwrap()); + +// let mut i = 0; +// for item in file_iter { +// i = i + 1; +// { +// // let mut text: Option = None; +// // let mut price_str: Option = None; +// // let mut rewriter = HtmlRewriter::new( +// // Settings { +// // element_content_handlers: vec![ +// // // Rewrite insecure hyperlinks +// // element!("a[href]", |el| { +// // let href = el.get_attribute("href").unwrap().replace("http:", "https:"); + +// // el.set_attribute("href", &href).unwrap(); + +// // Ok(()) +// // }), +// // ( +// // Cow::Owned("a".parse().unwrap()), +// // ElementContentHandlers::default().text(extract_first_text(&mut text)), +// // ), +// // element!( +// // "meta[property=\"product:price:amount\"]", +// // extract_first_attr(&mut price_str, "content") +// // ), +// // ], +// // memory_settings: lol_html::MemorySettings { +// // preallocated_parsing_buffer_size: 1024 * 16, +// // max_allowed_memory_usage: std::usize::MAX, +// // }, +// // ..Settings::default() +// // }, +// // |_: &[u8]| {}, +// // ); + +// // rewriter.write(&item).unwrap(); +// // rewriter.end().unwrap(); +// // println!("{:#?}", price_str); + +// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap()); + +// let html = String::from_utf8(item).unwrap(); +// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap(); + +// match parse_carrefour("".into(), &dom) { +// Ok(point) => { +// // println!("{:?}", point); +// } +// Err(err) => { +// // println!("Error {:#?}: {}", err, html); +// } +// }; +// } +// } +// println!("n={}", i); +// } + +// fn extract_first_text( +// output: &mut Option, +// ) -> impl FnMut( +// &mut TextChunk, +// ) -> Result<(), Box<(dyn std::error::Error + std::marker::Send + Sync + 'static)>> +// + '_ { +// move |el| { +// if *output == None { +// *output = 
Some(el.as_str().to_owned()); +// } +// Ok(()) +// } +// } + +// fn extract_first_attr<'a>( +// output: &'a mut Option, +// attr: &'a str, +// ) -> impl FnMut( +// &mut Element, +// ) -> Result<(), Box<(dyn std::error::Error + std::marker::Send + Sync + 'static)>> +// + 'a { +// move |el| { +// if *output == None { +// if let Some(value) = el.get_attribute(attr) { +// *output = Some(value); +// } +// } +// Ok(()) +// } +// } + #[tokio::main] async fn main() { let mut args = args().skip(1); @@ -38,7 +150,7 @@ async fn main() { let (res_sender, res_receiver) = async_channel::unbounded::(); let mut handles = Vec::new(); - for _ in 1..16 { + for _ in 1..32 { let rx = receiver.clone(); let tx = res_sender.clone(); handles.push(tokio::spawn(worker(rx, tx))); @@ -81,14 +193,7 @@ async fn worker(rx: Receiver, tx: Sender) { #[derive(Debug)] enum FetchError { HttpError(reqwest::Error), - NoPriceMetaEl, - NoMetaContent, - NotANumber, - NoStockMetaEl, - NoValidStockMeta, - NoSeedState, - NoProductInSeedState, - NoProductSkuInSeedState, + ParseError(&'static str), } async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result { @@ -102,69 +207,68 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result Result { - let meta_price_sel = Selector::parse("meta[property=\"product:price:amount\"]").unwrap(); - let precio_centavos = match html.select(&meta_price_sel).next() { - Some(el) => match el.attr("content") { - Some(attr) => match attr.parse::() { - Ok(f) => Ok((f * 100.0) as u64), - Err(_) => Err(FetchError::NotANumber), - }, - None => Err(FetchError::NoMetaContent), - }, - None => Err(FetchError::NoPriceMetaEl), + +fn parse_carrefour(url: String, dom: &tl::VDom) -> Result { + let precio_centavos = { + get_meta_content(dom, "product:price:amount")? 
+ .map(|s| { + s.parse::() + .map_err(|_| FetchError::ParseError("Failed to parse number")) + }) + .transpose() + .map(|f| f.map(|f| (f * 100.0) as u64)) }?; - let meta_stock_el = Selector::parse("meta[property=\"product:availability\"]").unwrap(); - let in_stock = match html.select(&meta_stock_el).next() { - Some(el) => match el.attr("content") { - Some(attr) => match attr { - "oos" => Ok(Some(false)), - "instock" => Ok(Some(true)), - _ => Err(FetchError::NoValidStockMeta), - }, - None => Err(FetchError::NoMetaContent), + let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned()); + let in_stock = match in_stock_meta { + Some(s) => match s.as_ref() { + "oos" => Some(false), + "instock" => Some(true), + _ => return Err(FetchError::ParseError("Not a valid product:availability")), }, - None => Err(FetchError::NoStockMetaEl), - }?; + None => None, + }; let ean = { - let state = parse_script_json(&html, "__STATE__").ok_or(FetchError::NoSeedState)?; - let seed_state = &state.as_object().ok_or(FetchError::NoSeedState)?; - let (_, product_json) = seed_state + let json = &parse_script_json(dom, "__STATE__")?; + let state = json + .as_object() + .ok_or(FetchError::ParseError("Seed state not an object"))?; + let (_, product_json) = state .into_iter() .find(|(key, val)| { key.starts_with("Product:") - && val.as_object().map_or(false, |val| { - val.get("__typename") - .map_or(false, |typename| typename == "Product") - }) + && val + .as_object() + .and_then(|val| val.get("__typename")) + .map_or(false, |typename| typename == "Product") }) - .ok_or(FetchError::NoProductInSeedState)?; + .ok_or(FetchError::ParseError("No product in seed state"))?; let cache_id = product_json .get("cacheId") - .ok_or(FetchError::NoProductInSeedState)?; - let (_, product_sku_json) = seed_state - .into_iter() - .filter_map(|(key, val)| val.as_object().map_or(None, |o| Some((key, o)))) + .and_then(|v| v.as_str()) + .ok_or(FetchError::ParseError("No cacheId in seed 
state"))?; + let (_, product_sku_json) = state + .iter() .find(|(key, val)| { key.starts_with(&format!("Product:{}", cache_id)) - && val - .get("__typename") - .map_or(false, |typename| typename == "SKU") + && val.as_object().map_or(false, |obj| { + obj.get("__typename") + .map_or(false, |typename| typename == "SKU") + }) }) - .ok_or(FetchError::NoProductSkuInSeedState)?; + .ok_or(FetchError::ParseError("No Product:cacheId* found"))?; product_sku_json .get("ean") - .ok_or(FetchError::NoProductSkuInSeedState)? - .as_str() - .ok_or(FetchError::NoProductSkuInSeedState)? + .and_then(|v| v.as_str()) + .ok_or(FetchError::ParseError("No product SKU in seed state"))? .to_string() }; @@ -175,29 +279,70 @@ fn parse_carrefour(url: String, html: Html) -> Result { name: None, image_url: None, parser_version: 5, - precio_centavos: Some(precio_centavos), + precio_centavos: precio_centavos, url: url, }) } -fn parse_script_json(html: &Html, varname: &str) -> Option { - let template_sel = Selector::parse(&format!( - "template[data-type=\"json\"][data-varname=\"{}\"]", - varname - )) - .unwrap(); - match html.select(&template_sel).next() { - Some(value) => match value.first_element_child() { - Some(script) => match serde_json::from_str(&script.inner_html()) { - Ok(val) => val, - Err(_) => None, - }, - None => None, - }, - None => None, +fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result>, FetchError> { + let tag = &dom + .query_selector(&format!("meta[property=\"{}\"]", prop)) + .and_then(|mut iter| iter.next()) + .and_then(|h| h.get(dom.parser())) + .and_then(|n| n.as_tag()); + match tag { + Some(tag) => Ok(Some( + tag.attributes() + .get("content") + .flatten() + .ok_or(FetchError::ParseError("Failed to get content attr"))? 
+ .as_utf8_str(), + )), + None => Ok(None), } } +fn parse_script_json(dom: &VDom, varname: &str) -> Result { + let parser = dom.parser(); + let inner_html = &dom + .query_selector(&format!( + "template[data-type=\"json\"][data-varname=\"{}\"]", + varname + )) + .and_then(|mut iter| iter.next()) + .and_then(|h| h.get(parser)) + .and_then(|n| n.as_tag()) + .and_then(|t| { + t.children() + .all(parser) + .iter() + .find(|n| n.as_tag().is_some()) + }) + .ok_or(FetchError::ParseError("Failed to get script tag"))? + .inner_html(parser); + Ok(inner_html + .parse() + .map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))?) +} + +// fn parse_script_json(html: &Html, varname: &str) -> Option { +// let template_sel = Selector::parse(&format!( +// "template[data-type=\"json\"][data-varname=\"{}\"]", +// varname +// )) +// .unwrap(); +// match html.select(&template_sel).next() { +// Some(value) => match value.first_element_child() { +// Some(script) => match serde_json::from_str(&script.inner_html()) { +// Ok(val) => val, +// Err(_) => None, +// }, +// None => None, +// }, +// None => None, +// } +// } + fn now_sec() -> u64 { let start = SystemTime::now(); let since_the_epoch = start @@ -210,6 +355,6 @@ async fn db_writer(rx: Receiver) { let conn = Connection::open("../scraper/sqlite.db").unwrap(); // let mut stmt = conn.prepare("SELECT id, name, data FROM person")?; while let Ok(res) = rx.recv().await { - println!("{:#?}", res) + println!("{:?}", res) } }