ci: activar cache

dia: solo pedir sitemap
volver a tl viejo pero sin simd
2024-11-29 13:06:19 +00:00 · 2024-01-12 00:08:02 -03:00 · 2024-01-12 00:07:11 -03:00 · 2024-01-11 23:46:40 -03:00 · 2024-01-11 23:46:33 -03:00 · 2024-01-11 23:35:12 -03:00
24 changed files with 2643 additions and 59 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -9,7 +9,8 @@
 		"ghcr.io/devcontainers/features/git-lfs:1": {},
 		"ghcr.io/devcontainers/features/node:1": {},
 		"ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {},
-		"ghcr.io/devcontainers/features/rust:1": {}
+		"ghcr.io/devcontainers/features/rust:1": {},
 		"ghcr.io/devcontainers/features/docker-in-docker:2": {}
 	},
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
--- a/.github/workflows/container.yml
+++ b/.github/workflows/container.yml
@ -81,3 +81,5 @@ jobs:
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
--- a/.gitignore
+++ b/.gitignore
@ -13,3 +13,7 @@ scraper/x.tsv
 *.tmp
 target/
 .env.*
 */flamegraph.svg
 */perf.data*
 scraper-rs/debug/
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -7,13 +7,13 @@
    {
      "type": "lldb",
      "request": "launch",
-      "name": "warcificator",
+      "name": "scraper-rs",
-      "cwd": "warcificator/",
+      "cwd": "scraper-rs/",
      "cargo": {
        // https://github.com/vadimcn/codelldb/issues/884
-        "args": ["build", "--manifest-path=warcificator/Cargo.toml"]
+        "args": ["build", "--manifest-path=scraper-rs/Cargo.toml"]
      },
-      "args": ["../data/samples/Carrefour.50.txt"],
+      "args": ["../data/Carrefour.txt"],
      "env": {}
    },
    {
--- a/2
+++ b/2
@ -11,7 +11,7 @@ RUN cd sitio && \
 RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
 FROM cgr.dev/chainguard/wolfi-base
-RUN apk add --no-cache nodejs npm jq bun
+RUN apk add --no-cache nodejs npm jq bun sqlite
 # Sitio
 COPY --from=build /usr/src/app/sitio/package.json package.real.json
--- a/Dockerfile.scraper
+++ b/Dockerfile.scraper
@ -1,5 +1,6 @@
-FROM docker.io/oven/bun:1-alpine AS base
+FROM cgr.dev/chainguard/wolfi-base AS base
 WORKDIR /usr/src/app
 RUN apk add --no-cache bun libgcc
 FROM base as build
 ENV NODE_ENV=production
@ -7,17 +8,22 @@ COPY . .
 RUN bun install --frozen-lockfile
 RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
 # nightly porque usamos tl con `simd` activado
 FROM base as rs-build
 RUN apk add --no-cache rust build-base sqlite-dev
 COPY scraper-rs/ .
 RUN cargo install --locked --path .
 FROM base
 RUN apk add --no-cache sqlite sqlite-libs
 # Scraper
 COPY --from=build /tmp/cli.build.js /bin/scraper
 COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
 COPY --from=rs-build /root/.cargo/bin/scraper-rs /usr/local/bin/scraper-rs
 ENV NODE_ENV=production
 ENV DB_PATH=/db/db.db
-# Cron scraper
+CMD ["bun", "/bin/scraper", "cron"]
 RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
    && chmod +x /etc/periodic/daily/scraper
 CMD ["busybox", "crond", "-f", "-l2"]
--- a/bun.lockb
+++ b/bun.lockb
--- a/data/Jumbo.txt
+++ b/data/Jumbo.txt
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
+oid sha256:f231884c2b9fd0b633746892a00824379b4d8aa110e6348309197b83b0d1c555
-size 922185
+size 926218
--- a/link-scrapers/dia.ts
+++ b/link-scrapers/dia.ts
@ -68,7 +68,10 @@ const categorias = [
 ];
 export async function scrapDiaProducts() {
-  await Promise.all([scrapBySite(), scrapBySitemap()]);
+  await Promise.all([
    // scrapBySite(),
    scrapBySitemap(),
  ]);
 }
 async function scrapBySitemap() {
@ -104,7 +107,7 @@ async function scrapBySite() {
  await pMap(
    links,
    async (url) => {
-      const res = await fetch(url);
+      const res = await fetch(url, { timeout: false });
      const html = await res.text();
      const { document } = parseHTML(html);
--- a/scraper-rs/Cargo.lock
+++ b/scraper-rs/Cargo.lock
--- a/scraper-rs/Cargo.toml
+++ b/scraper-rs/Cargo.toml
@ -0,0 +1,33 @@
 [package]
 name = "scraper-rs"
 version = "0.1.0"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
 again = "0.1.2"
 anyhow = "1.0.79"
 async-channel = "2.1.1"
 clap = { version = "4.4.15", features = ["derive"] }
 nanoid = "0.4.0"
 r2d2 = "0.8.10"
 r2d2_sqlite = "0.23.0"
 rand = "0.8.5"
 # lol_html = "1.2.0"
 reqwest = { version = "0.11.23", default-features = false, features = [
    "rustls-tls",
    "gzip",
    "brotli",
    "socks",
 ] }
 rusqlite = "0.30.0"
 # scraper = "0.18.1"
 serde = { version = "1.0.193", features = ["derive"] }
 serde_json = "1.0.109"
 simple-error = "0.3.0"
 thiserror = "1.0.56"
 tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1" }
 tokio = { version = "1.35.1", features = ["full"] }
 tracing = "0.1"
 tracing-subscriber = "0.3"
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@ -0,0 +1,247 @@
 use again::RetryPolicy;
 use async_channel::Receiver;
 use clap::Parser;
 use nanoid::nanoid;
 use r2d2::Pool;
 use r2d2_sqlite::SqliteConnectionManager;
 use reqwest::{StatusCode, Url};
 use simple_error::{bail, SimpleError};
 use std::{
    env::{self},
    fs,
    path::PathBuf,
    time::Duration,
 };
 use thiserror::Error;
 // Command-line interface: each variant is one subcommand, dispatched in `main`.
 // Plain `//` comments are used on purpose here: clap's derive turns `///` doc
 // comments into help text, which would change the program's output.
 #[derive(Parser)] // requires `derive` feature
 enum Args {
    // Handled by `fetch_list_cli`.
    FetchList(FetchListArgs),
    // Handled by `parse_file_cli`.
    ParseFile(ParseFileArgs),
 }
 // Arguments of the `FetchList` subcommand.
 #[derive(clap::Args)]
 struct FetchListArgs {
    // Path of the text file passed to `fetch_list_cli`, which reads one URL per non-empty line.
    list_path: String,
 }
 // Arguments of the `ParseFile` subcommand.
 #[derive(clap::Args)]
 struct ParseFileArgs {
    // Path of the saved HTML file passed to `parse_file_cli`.
    file_path: String,
 }
 /// Entry point: initialises tracing output, parses the command line and
 /// runs the selected subcommand to completion.
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();
    match args {
        Args::ParseFile(parse) => parse_file_cli(parse.file_path).await,
        Args::FetchList(fetch) => fetch_list_cli(fetch.list_path).await,
    }
 }
 async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
    let links_str = fs::read_to_string(links_list_path).unwrap();
    let links = links_str
        .split('\n')
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .map(|s| s.to_owned())
        .collect::<Vec<_>>();
    let (sender, receiver) = async_channel::bounded::<String>(1);
    let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
    let manager = SqliteConnectionManager::file(db_path);
    let pool = Pool::new(manager).unwrap();
    let n_coroutines = env::var("N_COROUTINES")
        .map_or(Ok(128), |s| s.parse::<usize>())
        .expect("N_COROUTINES no es un número");
    let handles = (1..n_coroutines)
        .map(|_| {
            let rx = receiver.clone();
            let pool = pool.clone();
            tokio::spawn(worker(rx, pool))
        })
        .collect::<Vec<_>>();
    for link in links {
        sender.send_blocking(link).unwrap();
    }
    sender.close();
    let mut counters = Counters::default();
    for handle in handles {
        let c = handle.await.unwrap();
        counters.success += c.success;
        counters.errored += c.errored;
        counters.skipped += c.skipped;
    }
    println!("Finished: {:?}", counters);
    Ok(())
 }
 /// Creates the HTTP client used for requests, without customising any builder option.
 ///
 /// # Panics
 /// Panics if reqwest fails to build the client.
 fn build_client() -> reqwest::Client {
    let builder = reqwest::Client::builder();
    builder.build().unwrap()
 }
 /// Per-worker tally of processed URLs; `fetch_list_cli` sums these and prints the total.
 #[derive(Default, Debug)]
 struct Counters {
    /// URLs for which `fetch_and_parse` returned `Ok`.
    success: u64,
    /// URLs that failed with any error other than HTTP 404.
    errored: u64,
    /// URLs that answered HTTP 404.
    skipped: u64,
 }
 async fn worker(rx: Receiver<String>, pool: Pool<SqliteConnectionManager>) -> Counters {
    let client = build_client();
    let mut counters = Counters::default();
    while let Ok(url) = rx.recv().await {
        let res = fetch_and_parse(&client, url.clone()).await;
        match res {
            Ok(res) => {
                counters.success += 1;
                pool.get().unwrap().execute("INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",rusqlite::params![
                    res.ean,
                    res.fetched_at,
                    res.precio_centavos,
                    res.in_stock,
                    res.url,
                    None::<String>,
                    res.parser_version,
                    res.name,
                    res.image_url,
                ]).unwrap();
            }
            Err(err) => {
                match err.downcast_ref::<FetchError>() {
                    Some(FetchError::HttpStatus(StatusCode::NOT_FOUND)) => counters.skipped += 1,
                    _ => counters.errored += 1,
                }
                tracing::error!(error=%err, url=url);
            }
        }
    }
    counters
 }
 /// Errors raised while downloading or parsing a page.
 /// `worker` downcasts to this type to recognise HTTP 404 responses.
 #[derive(Debug, Error)]
 enum FetchError {
    /// Failure reported by reqwest (sending the request or reading the body).
    #[error("reqwest error")]
    Http(#[from] reqwest::Error),
    /// The server answered with a non-success status code.
    #[error("http status: {0}")]
    HttpStatus(reqwest::StatusCode),
    /// Wraps a `SimpleError`. NOTE(review): not constructed explicitly in the visible code.
    #[error("parse error")]
    Parse(#[from] SimpleError),
    /// Wraps an HTML parser error from `tl`. NOTE(review): not constructed explicitly in the visible code.
    #[error("tl error")]
    Tl(#[from] tl::ParseError),
 }
 #[tracing::instrument(skip(client))]
 async fn fetch_and_parse(
    client: &reqwest::Client,
    url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
    let policy = RetryPolicy::exponential(Duration::from_millis(300))
        .with_max_retries(10)
        .with_jitter(true);
    let response = policy
        .retry(|| {
            let request = client.get(url.as_str()).build().unwrap();
            client.execute(request)
        })
        .await
        .map_err(FetchError::Http)?;
    if !response.status().is_success() {
        bail!(FetchError::HttpStatus(response.status()));
    }
    let body = response.text().await.map_err(FetchError::Http)?;
    let maybe_point = { scrap_url(client, url, &body).await };
    let point = match maybe_point {
        Ok(p) => Ok(p),
        Err(err) => {
            let debug_path = PathBuf::from("debug/");
            tokio::fs::create_dir_all(&debug_path).await.unwrap();
            let file_path = debug_path.join(format!("{}.html", nanoid!()));
            tokio::fs::write(&file_path, &body).await.unwrap();
            tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
            Err(err)
        }
    }?;
    Ok(point)
 }
 /// Debug helper: runs the site parsers on an HTML file saved on disk.
 ///
 /// The page URL is taken from the `href` of the first `link[rel="canonical"]`
 /// tag and printed; the file contents are then passed to [`scrap_url`] and the
 /// outcome (success or error) is printed with `{:?}`.
 ///
 /// # Panics
 /// Panics if the first canonical link tag is missing or has no `href`.
 async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
    let contents = tokio::fs::read_to_string(file_path).await?;
    let client = build_client();

    let url = {
        let dom = tl::parse(&contents, tl::ParserOptions::default())?;
        let canonical = dom
            .query_selector("link[rel=\"canonical\"]")
            .unwrap()
            .find_map(|handle| handle.get(dom.parser()).and_then(|node| node.as_tag()));
        let href = canonical
            .and_then(|tag| tag.attributes().get("href").flatten())
            .expect("No meta canonical");
        href.as_utf8_str().to_string()
    };

    println!("URL: {}", &url);
    let outcome = scrap_url(&client, url, &contents).await;
    println!("{:?}", outcome);
    Ok(())
 }
 /// Parses a downloaded page, choosing the site-specific parser from the URL's host.
 ///
 /// Carrefour, Dia and Coto pages are parsed from `body` alone. Jumbo pages are
 /// handed to `sites::jumbo::scrap` together with `client`.
 ///
 /// # Errors
 /// Fails if `url` is not a valid URL, has no host, the host is not one of the
 /// supported sites, or the site parser fails.
 async fn scrap_url(
    client: &reqwest::Client,
    url: String,
    body: &str,
 ) -> anyhow::Result<PrecioPoint> {
    // Both lookups used to `unwrap()`, so a bad URL panicked the calling task.
    let url_p = Url::parse(&url)?;
    let host = match url_p.host_str() {
        Some(host) => host,
        None => return Err(anyhow::anyhow!("URL has no host: {}", url)),
    };

    let parse: fn(String, &tl::VDom) -> anyhow::Result<PrecioPoint> = match host {
        "www.carrefour.com.ar" => sites::carrefour::parse,
        "diaonline.supermercadosdia.com.ar" => sites::dia::parse,
        "www.cotodigital3.com.ar" => sites::coto::parse,
        "www.jumbo.com.ar" => return sites::jumbo::scrap(client, url, body).await,
        s => bail!("Unknown host {}", s),
    };

    // The DOM is built only after the last `.await` above, so it is never held across one.
    let dom = tl::parse(body, tl::ParserOptions::default())?;
    parse(url, &dom)
 }
 use std::time::{SystemTime, UNIX_EPOCH};
 mod sites;
 /// One scraped observation of a product; `worker` inserts it as a row of `precios`.
 #[derive(Debug)]
 struct PrecioPoint {
    /// Product barcode (EAN) as extracted by the site parser.
    ean: String,
    /// Time of the observation, in seconds since the Unix epoch (the parsers fill it with `now_sec`).
    fetched_at: u64,
    /// Price in centavos (the parsers multiply the parsed amount by 100); `None` when no price was found.
    precio_centavos: Option<u64>,
    /// Whether the product was reported as in stock.
    in_stock: Option<bool>,
    /// URL the page was fetched from.
    url: String,
    /// Version tag written by the parser that produced this point (the visible parsers use 5).
    parser_version: u16,
    /// Product name, when available.
    name: Option<String>,
    /// Product image URL, when available.
    image_url: Option<String>,
 }
 /// Returns the current time as whole seconds since the Unix epoch.
 ///
 /// # Panics
 /// Panics if the system clock reports a time before the Unix epoch.
 fn now_sec() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_secs())
        .expect("Time went backwards")
 }
--- a/scraper-rs/src/sites/carrefour.rs
+++ b/scraper-rs/src/sites/carrefour.rs
@ -0,0 +1,68 @@
 use simple_error::bail;
 use simple_error::SimpleError;
 use crate::sites::common;
 use crate::sites::vtex;
 use crate::PrecioPoint;
 use super::vtex::find_product_ld;
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let precio_centavos = common::price_from_meta(dom)?;
    let in_stock = vtex::in_stock_from_meta(dom)?;
    let ean = {
        let json = &vtex::parse_script_json(dom, "__STATE__")?;
        let state = json
            .as_object()
            .ok_or(SimpleError::new("Seed state not an object"))?;
        if state.is_empty() {
            bail!("Seed state is an empty object")
        }
        let (_, product_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
            })
            .ok_or(SimpleError::new("No product in seed state"))?;
        let cache_id = product_json
            .get("cacheId")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No cacheId in seed state"))?;
        let (_, product_sku_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with(&format!("Product:{}", cache_id))
                    && val.get("__typename").is_some_and(|t| t == "SKU")
            })
            .ok_or(SimpleError::new("No Product:cacheId* found"))?;
        product_sku_json
            .get("ean")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No product SKU in seed state"))?
            .to_string()
    };
    let (name, image_url) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (Some(p.name), Some(p.image))
        }
        None => match in_stock {
            true => bail!("No JSONLD product in in stock product"),
            false => (None, None),
        },
    };
    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
 }
--- a/scraper-rs/src/sites/common.rs
+++ b/scraper-rs/src/sites/common.rs
@ -0,0 +1,19 @@
 use std::borrow::Cow;
 use tl::VDom;
 /// Returns the `content` attribute of the first `<meta property="{prop}">` tag,
 /// or `None` if the selector is rejected, no tag matches, or the attribute is missing.
 pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
    let selector = format!("meta[property=\"{}\"]", prop);
    let handle = dom.query_selector(&selector)?.next()?;
    let tag = handle.get(dom.parser())?.as_tag()?;
    let content = tag.attributes().get("content").flatten()?;
    Some(content.as_utf8_str())
 }
 /// Reads the price from the `product:price:amount` meta tag and converts it to centavos.
 ///
 /// Returns `Ok(None)` when the tag is absent.
 ///
 /// # Errors
 /// Fails if the content is not a number, or is negative or not finite.
 pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
    let Some(raw) = get_meta_content(dom, "product:price:amount") else {
        return Ok(None);
    };
    let amount: f64 = raw.parse()?;
    // `as u64` would silently turn NaN and negative amounts into 0.
    if !amount.is_finite() || amount < 0.0 {
        anyhow::bail!("Invalid product:price:amount: {}", raw);
    }
    // Round before casting: `19.99 * 100.0` is 1998.9999999999998, which a bare
    // `as u64` truncates to 1998 centavos.
    Ok(Some((amount * 100.0).round() as u64))
 }
--- a/scraper-rs/src/sites/coto.rs
+++ b/scraper-rs/src/sites/coto.rs
@ -0,0 +1,77 @@
 use anyhow::Context;
 use crate::PrecioPoint;
 /// Builds a [`PrecioPoint`] from a Coto product page.
 ///
 /// * EAN: second `span.span_codigoplu` inside the `div#brandText` whose text contains `| EAN: `.
 /// * Price: text of the first `.atg_store_newPrice` element (`$1.234,56` style), in centavos.
 /// * Stock: in stock unless a `.product_not_available` element is present.
 /// * Name / image: first `h1.product_page` text and first `.zoomImage1` `src`.
 ///
 /// # Errors
 /// Fails if the EAN cannot be located or the price text is not a number.
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = dom
        .query_selector("div#brandText")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
        .context("No encuentro eanparent")?
        .query_selector(dom.parser(), "span.span_codigoplu")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .nth(1)
        .context("no encuentro el ean")?
        .inner_text(dom.parser())
        .trim()
        .to_string();

    let precio_centavos = dom
        .query_selector(".atg_store_newPrice")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .filter(|s| !s.is_empty())
        .map(|s| {
            // "$1.234,56" -> "1234.56": drop the currency sign and thousands
            // separators, then use '.' as the decimal separator.
            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
            let s = s.trim();
            s.parse::<f64>()
        })
        .transpose()
        .context("Parseando precio")?
        // Round before casting: a bare `as u64` truncates, so an amount such as
        // 19.99 (19.99 * 100.0 == 1998.9999999999998) would lose one centavo.
        .map(|f| (f * 100.0).round() as u64);

    // The marker element signals that the product is NOT available, so the product
    // is in stock only when it is absent (the check used to be inverted).
    // NOTE(review): rows already stored by this parser carry the inverted flag;
    // consider whether `parser_version` should be bumped.
    let in_stock = Some(
        dom.query_selector(".product_not_available")
            .unwrap()
            .filter_map(|h| h.get(dom.parser()))
            .filter_map(|n| n.as_tag())
            .next()
            .is_none(),
    );

    let name = dom
        .query_selector("h1.product_page")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .map(|s| s.trim().to_string());

    let image_url = dom
        .query_selector(".zoomImage1")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .and_then(|t| t.attributes().get("src").flatten())
        .map(|s| s.as_utf8_str().to_string());

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
 }
--- a/scraper-rs/src/sites/dia.rs
+++ b/scraper-rs/src/sites/dia.rs
@ -0,0 +1,41 @@
 use anyhow::Context;
 use simple_error::bail;
 use crate::sites::common;
 use crate::PrecioPoint;
 use super::vtex::find_product_ld;
 use super::vtex::AvailabilityLd;
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = common::get_meta_content(dom, "product:retailer_item_id")
        .context("Parsing EAN")?
        .to_string();
    let precio_centavos = common::price_from_meta(dom)?;
    let (name, image_url, in_stock) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (
                Some(p.name),
                Some(p.image),
                Some(
                    p.offers.offers.first().context("No offer")?.availability
                        == AvailabilityLd::InStock,
                ),
            )
        }
        None => bail!("No JSON/LD"),
    };
    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
 }
--- a/scraper-rs/src/sites/jumbo.rs
+++ b/scraper-rs/src/sites/jumbo.rs
@ -0,0 +1,92 @@
 use std::str::FromStr;
 use anyhow::Context;
 use reqwest::Url;
 use serde::Deserialize;
 use simple_error::bail;
 use crate::sites::common;
 use crate::PrecioPoint;
 use super::vtex;
 /// One element of the JSON array returned by the Jumbo product search endpoint.
 /// Only the field read by `get_ean_from_search` is declared.
 #[derive(Deserialize)]
 struct JumboSearch {
    /// Items listed under this search result.
    items: Vec<JumboSearchItem>,
 }
 /// One entry of `JumboSearch::items`.
 #[derive(Deserialize)]
 struct JumboSearchItem {
    /// Barcode (EAN) reported for this item.
    ean: String,
 }
 /// Looks up the EAN of a Jumbo product by querying the catalog search endpoint
 /// with `fq=skuId:<retailer_sku>`.
 ///
 /// Uses the first search result and requires every item in it to share one EAN.
 ///
 /// # Errors
 /// Fails if the request fails, the response is not the expected JSON, there is
 /// no result or no item, or the items disagree on the EAN.
 async fn get_ean_from_search(
    client: &reqwest::Client,
    retailer_sku: String,
 ) -> anyhow::Result<String> {
    let mut search_url =
        Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search").unwrap();
    search_url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));

    let response = client.get(search_url).send().await?;
    let payload = response.text().await?;

    let search: Vec<JumboSearch> = serde_json::from_str(&payload)?;
    let result = search.first().context("No search result")?;
    let first_item = result.items.first().context("No search result")?;
    let ean = first_item.ean.clone();

    if result.items.iter().any(|item| item.ean != ean) {
        bail!("Inesperado: no todos los items tienen el mismo EAN")
    }
    Ok(ean)
 }
 pub async fn scrap(
    client: &reqwest::Client,
    url: String,
    body: &str,
 ) -> Result<PrecioPoint, anyhow::Error> {
    let (name, image_url, sku, precio_centavos, in_stock) = {
        let dom = tl::parse(body, tl::ParserOptions::default())?;
        let precio_centavos = common::price_from_meta(&dom)?;
        let in_stock = vtex::in_stock_from_meta(&dom)?;
        match vtex::find_product_ld(&dom) {
            Some(pm) => {
                let p = pm?;
                (
                    Some(p.name),
                    Some(p.image),
                    p.sku.context("No retailer SKU in Product LD")?,
                    precio_centavos,
                    in_stock,
                )
            }
            None => bail!("No JSON/LD"),
        }
    };
    let ean = get_ean_from_search(client, sku).await?;
    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
 }
--- a/scraper-rs/src/sites/mod.rs
+++ b/scraper-rs/src/sites/mod.rs
@ -0,0 +1,6 @@
 pub mod carrefour;
 mod common;
 pub mod coto;
 pub mod dia;
 pub mod jumbo;
 mod vtex;
--- a/scraper-rs/src/sites/vtex.rs
+++ b/scraper-rs/src/sites/vtex.rs
@ -0,0 +1,102 @@
 use anyhow::{bail, Context};
 use serde::Deserialize;
 use simple_error::SimpleError;
 use tl::VDom;
 use super::common;
 /// Extracts and parses the JSON stored in
 /// `<template data-type="json" data-varname="{varname}"><script>…</script></template>`.
 ///
 /// # Errors
 /// Fails if no template has the requested `data-varname`, the template holds
 /// no `<script>`, or the script contents are not valid JSON.
 pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
    let template = dom
        .query_selector("template[data-type=\"json\"]")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()).and_then(|n| n.as_tag()))
        .find(|t| {
            t.attributes()
                .get("data-varname")
                .flatten()
                .is_some_and(|v| v.as_utf8_str() == varname)
        })
        .ok_or(SimpleError::new("Failed to get template tag"))?;

    let script = template
        .query_selector(dom.parser(), "script")
        .and_then(|mut it| it.next())
        .and_then(|h| h.get(dom.parser()))
        .ok_or(SimpleError::new("Failed to get script tag"))?;

    let raw_json = script.inner_html(dom.parser());
    raw_json.parse().context("Couldn't parse JSON in script")
 }
 /// Iterates over every `<script type="application/ld+json">` tag in the document,
 /// yielding the result of parsing each tag's inner HTML as JSON.
 ///
 /// Tags whose content is not valid JSON yield an `Err` item instead of being skipped.
 pub fn get_json_lds<'a>(
    dom: &'a VDom,
 ) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
    dom.query_selector("script[type=\"application/ld+json\"]")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .map(|t| serde_json::from_str(&t.inner_html(dom.parser())))
 }
 /// Finds the first JSON-LD block whose `@type` equals `typ` and deserializes it into [`Ld`].
 ///
 /// Blocks that are not valid JSON are ignored. Returns `None` when no block
 /// matches, and `Some(Err(_))` when the matching block does not fit [`Ld`].
 pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
    let matching = get_json_lds(dom)
        .flatten()
        .find(|value| value.get("@type").is_some_and(|t| t == typ));
    matching.map(serde_json::from_value)
 }
 /// Finds the JSON-LD block of type `Product` and returns its payload.
 ///
 /// `None` means the page has no such block; `Some(Err(_))` means it could not
 /// be deserialized.
 pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
    let found = find_json_ld(dom, "Product")?;
    Some(found.map(|ld| {
        let Ld::Product(product) = ld;
        product
    }))
 }
 /// A JSON-LD document, selected by its `@type` field. Only `Product` is supported.
 #[derive(Deserialize)]
 #[serde(tag = "@type")]
 pub enum Ld {
    /// A JSON-LD block with `"@type": "Product"`.
    Product(ProductLd),
 }
 /// Fields read from a JSON-LD `Product` block (keys are camelCase in the JSON).
 #[derive(Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct ProductLd {
    /// Product name.
    pub name: String,
    /// Product image, stored by the parsers as the image URL.
    pub image: String,
    /// Retailer SKU, when present; the Jumbo parser uses it to look up the EAN.
    pub sku: Option<String>,
    /// The block's `offers` object.
    pub offers: OffersLd,
 }
 /// The `offers` object of a product: a wrapper around a list of offers.
 #[derive(Deserialize)]
 pub struct OffersLd {
    /// Individual offers; the Dia parser reads availability from the first one.
    pub offers: Vec<OfferLd>,
 }
 /// A single offer inside [`OffersLd`] (keys are camelCase in the JSON).
 #[derive(Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct OfferLd {
    /// The offer's `@type`; deserialization fails unless it is `Offer`.
    #[serde(rename = "@type")]
    _type: OfferTypeLd,
    /// Offer price as a number.
    pub price: f64,
    /// Value of the `priceCurrency` key.
    pub price_currency: String,
    /// Availability of the offer.
    pub availability: AvailabilityLd,
 }
 /// Accepted values of an offer's `@type` field.
 #[derive(Deserialize)]
 pub enum OfferTypeLd {
    /// Matches the string `Offer`.
    Offer,
 }
 /// Availability of an offer, deserialized from its schema.org URL.
 /// Any other value makes deserialization fail.
 #[derive(Deserialize, PartialEq)]
 pub enum AvailabilityLd {
    /// `http://schema.org/InStock`
    #[serde(rename = "http://schema.org/InStock")]
    InStock,
    /// `http://schema.org/OutOfStock`
    #[serde(rename = "http://schema.org/OutOfStock")]
    OutOfStock,
 }
 pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
    Ok(
        match common::get_meta_content(dom, "product:availability") {
            Some(s) => match s.as_ref() {
                "oos" => false,
                "instock" => true,
                _ => bail!("Not a valid product:availability"),
            },
            None => bail!("No product:availability in carrefour"),
        },
    )
 }
--- a/scraper/auto.ts
+++ b/scraper/auto.ts
@ -4,7 +4,6 @@ import { join } from "node:path";
 import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
 import PQueue from "p-queue";
 import { formatDuration, intervalToDuration } from "date-fns";
 import { downloadList } from "./scrap.js";
 import { db } from "db-datos/db.js";
 import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
@ -12,9 +11,10 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapCotoProducts } from "../link-scrapers/coto.js";
 import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
 import { readableStreamToText } from "bun";
 // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
-const scrapQueue = new PQueue({ concurrency: 4 });
+const scrapQueue = new PQueue({ concurrency: 1 });
 export async function auto() {
  const a = new Auto();
@ -38,11 +38,7 @@ class Auto {
    this.inform("[auto] Empezando scrap");
  }
-  async downloadList(supermercado: Supermercado) {
+  async scrapUrls(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
    let listPath: string;
    {
    const t0 = performance.now();
    switch (supermercado) {
      case "Dia":
@ -63,7 +59,14 @@ class Auto {
    );
  }
-    listPath = join(ctxPath, `lista-${supermercado}.txt`);
+  async downloadList(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
    await scrapQueue.add(async () => {
      await this.scrapUrls(supermercado);
    });
    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
    const host = Object.entries(hosts).find(
      ([host, supe]) => supe === supermercado
    )![0];
@ -82,16 +85,25 @@ class Auto {
  async scrapAndInform({ listPath }: { listPath: string }) {
    const res = await scrapQueue.add(async () => {
      const t0 = performance.now();
-      const progress = await downloadList(listPath);
+
-      return { took: performance.now() - t0, progress };
+      const sub = Bun.spawn({
        cmd: ["scraper-rs", "fetch-list", listPath],
        stdio: ["ignore", "pipe", "inherit"],
      });
      const text = await readableStreamToText(sub.stdout);
      const code = await sub.exited;
      if (code !== 0) throw new Error(`scraper-rs threw ${code}`);
      return { took: performance.now() - t0, text };
    });
    if (res) {
-      const { took, progress } = res;
+      const { took, text } = res;
      this.inform(
-        `Procesado ${listPath} (${progress.done} ok, ${
+        `Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`
-          progress.skipped
+        //(${progress.done} ok, ${
-        } skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})`
+        //   progress.skipped
        // } skipped, ${progress.errors.length} errores)
      );
    } else {
      this.inform(`Algo falló en ${listPath}`);
--- a/scraper/cli.ts
+++ b/scraper/cli.ts
@ -4,9 +4,14 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
 import { auto } from "./auto.js";
 import { downloadList, getProduct } from "./scrap.js";
 import Cron from "croner";
 if (process.argv[2] === "auto") {
  await auto();
 } else if (process.argv[2] === "cron") {
  Cron("0 2 * * *", () => {
    auto();
  });
 } else if (process.argv[2] === "scrap-carrefour-links") {
  await scrapCarrefourProducts();
 } else if (process.argv[2] === "scrap-dia-links") {
--- a/scraper/package.json
+++ b/scraper/package.json
@ -13,6 +13,7 @@
  "dependencies": {
    "@aws-sdk/client-s3": "^3.478.0",
    "@aws-sdk/lib-storage": "^3.478.0",
    "croner": "^8.0.0",
    "date-fns": "^3.0.6",
    "db-datos": "workspace:^",
    "drizzle-orm": "^0.29.1",
--- a/sitio/package.json
+++ b/sitio/package.json
@ -38,7 +38,6 @@
    "better-sqlite3": "^9.2.2",
    "chart.js": "^4.4.1",
    "chartjs-adapter-dayjs-4": "^1.0.4",
    "croner": "^8.0.0",
    "dayjs": "^1.11.10",
    "drizzle-orm": "^0.29.1"
  }
--- a/sitio/src/hooks.server.ts
+++ b/sitio/src/hooks.server.ts
@ -1,12 +0,0 @@
 import { spawn } from "child_process";
 import Cron from "croner";
 if (process.env.NODE_ENV === "production") {
  const job = Cron("15 3 * * *", () => {
    runScraper();
  });
 }
 function runScraper() {
  spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
 }
Author	SHA1	Message	Date
Nulo	6853b6389a	ci: activar cache	2024-01-12 00:08:02 -03:00
Nulo	a29b86c8d2	dia: solo pedir sitemap	2024-01-12 00:07:11 -03:00
Nulo	76cd8f3658	volver a tl viejo pero sin simd	2024-01-11 23:46:40 -03:00
Nulo	34bd9aa07e	scrapear otras cosas aparte de dia	2024-01-11 23:46:33 -03:00
Nulo	7b9533cde6	traer sqlite-libs	2024-01-11 23:35:12 -03:00
Nulo	6507bd944b	ups	2024-01-11 23:23:47 -03:00
Nulo	78e0f3cdee	arreglar container scraper	2024-01-11 22:47:20 -03:00
Nulo	66fc8767ef	correr cron dentro de scraper	2024-01-11 21:08:43 -03:00
Nulo	a438eec238	wip: blah	2024-01-11 21:11:12 +00:00
Nulo	eb2f04695c	wip: arreglar dockerfile.scraper	2024-01-11 20:56:51 +00:00
Nulo	bf3dbcc019	docker en devcontainer	2024-01-11 20:56:40 +00:00
Nulo	fa9a010263	wip: docker -rs	2024-01-11 17:40:09 -03:00
Nulo	07324f756c	lockfile	2024-01-11 17:35:57 -03:00
Nulo	91397a31d2	borrar cron sitio	2024-01-11 17:21:14 -03:00
Nulo	4b211c89af	conectar auto con scraper-rs!	2024-01-11 17:20:12 -03:00
Nulo	cfae80cb9a	Merge branch 'master' into wip-rust-downloader	2024-01-11 17:19:07 -03:00
Nulo	972d5ade18	jumbo	2024-01-11 15:48:20 -03:00
Nulo	1348bee6c7	no instrumentar funciones pesadas arregla problemas de perf	2024-01-11 15:47:54 -03:00
Nulo	8e8fe8ddaf	lista jumbo actualizada cambian las urls muy rápido	2024-01-11 15:30:25 -03:00
Nulo	37ceb15e74	arreglar image_url coto	2024-01-11 14:45:50 -03:00
Nulo	f2401aa965	parse file y init coto (WIP	2024-01-11 14:09:18 -03:00
Nulo	3a31586193	limpiar	2024-01-11 14:02:13 -03:00
Nulo	8f6f62a261	clap cli	2024-01-11 14:02:02 -03:00
Nulo	b696551949	scraper-rs: dia	2024-01-11 13:05:51 -03:00
Nulo	27aee01c1a	scraper-rs: simplificar y parsear json ld	2024-01-11 12:55:14 -03:00
Nulo	348d054b7b	renombrar warcificator -> scraper-rs	2024-01-10 21:46:10 -03:00
Nulo	613efc3111	warcificator: restructuracion masiva	2024-01-10 21:44:35 -03:00
Nulo	78878d8b7e	warcificator: seguir limpiando	2024-01-08 11:57:08 -03:00
Nulo	1abd98724d	warcificator: limpiar	2024-01-08 11:55:59 -03:00
Nulo	56a257c389	warcificator: limpiar	2024-01-08 11:53:46 -03:00
Nulo	2d2912e4e9	warcificator : conseguir threads de env	2024-01-08 11:51:49 -03:00
Nulo	abd430421c	sqlite en contenedor	2024-01-08 10:29:55 -03:00
Nulo	c56272dc30	Merge branch 'master' into wip-rust-downloader	2024-01-08 10:29:24 -03:00
Nulo	3cf723cc3d	xoxo	2024-01-01 22:31:57 -03:00