Compare commits

...

34 commits

Author SHA1 Message Date
6853b6389a ci: activar cache 2024-01-12 00:08:02 -03:00
a29b86c8d2 dia: solo pedir sitemap 2024-01-12 00:07:11 -03:00
76cd8f3658 volver a tl viejo pero sin simd 2024-01-11 23:46:40 -03:00
34bd9aa07e scrapear otras cosas aparte de dia 2024-01-11 23:46:33 -03:00
7b9533cde6 traer sqlite-libs 2024-01-11 23:35:12 -03:00
6507bd944b ups 2024-01-11 23:23:47 -03:00
78e0f3cdee arreglar container scraper 2024-01-11 22:47:20 -03:00
66fc8767ef correr cron dentro de scraper 2024-01-11 21:08:43 -03:00
Nulo
a438eec238
wip: blah 2024-01-11 21:11:12 +00:00
Nulo
eb2f04695c
wip: arreglar dockerfile.scraper 2024-01-11 20:56:51 +00:00
Nulo
bf3dbcc019
docker en devcontainer 2024-01-11 20:56:40 +00:00
fa9a010263 wip: docker -rs 2024-01-11 17:40:09 -03:00
07324f756c lockfile 2024-01-11 17:35:57 -03:00
91397a31d2 borrar cron sitio 2024-01-11 17:21:14 -03:00
4b211c89af conectar auto con scraper-rs! 2024-01-11 17:20:12 -03:00
cfae80cb9a Merge branch 'master' into wip-rust-downloader 2024-01-11 17:19:07 -03:00
972d5ade18 jumbo 2024-01-11 15:48:20 -03:00
1348bee6c7 no instrumentar funciones pesadas
arregla problemas de perf
2024-01-11 15:47:54 -03:00
8e8fe8ddaf lista jumbo actualizada
cambian las urls muy rápido
2024-01-11 15:30:25 -03:00
37ceb15e74 arreglar image_url coto 2024-01-11 14:45:50 -03:00
f2401aa965 parse file y init coto (WIP 2024-01-11 14:09:18 -03:00
3a31586193 limpiar 2024-01-11 14:02:13 -03:00
8f6f62a261 clap cli 2024-01-11 14:02:02 -03:00
b696551949 scraper-rs: dia 2024-01-11 13:05:51 -03:00
27aee01c1a scraper-rs: simplificar y parsear json ld 2024-01-11 12:55:14 -03:00
348d054b7b renombrar warcificator -> scraper-rs 2024-01-10 21:46:10 -03:00
613efc3111 warcificator: restructuracion masiva 2024-01-10 21:44:35 -03:00
78878d8b7e warcificator: seguir limpiando 2024-01-08 11:57:08 -03:00
1abd98724d warcificator: limpiar 2024-01-08 11:55:59 -03:00
56a257c389 warcificator: limpiar 2024-01-08 11:53:46 -03:00
2d2912e4e9 warcificator : conseguir threads de env 2024-01-08 11:51:49 -03:00
abd430421c sqlite en contenedor 2024-01-08 10:29:55 -03:00
c56272dc30 Merge branch 'master' into wip-rust-downloader 2024-01-08 10:29:24 -03:00
3cf723cc3d xoxo 2024-01-01 22:31:57 -03:00
24 changed files with 2643 additions and 59 deletions

View file

@ -9,7 +9,8 @@
"ghcr.io/devcontainers/features/git-lfs:1": {}, "ghcr.io/devcontainers/features/git-lfs:1": {},
"ghcr.io/devcontainers/features/node:1": {}, "ghcr.io/devcontainers/features/node:1": {},
"ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {}, "ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {},
"ghcr.io/devcontainers/features/rust:1": {} "ghcr.io/devcontainers/features/rust:1": {},
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
}, },
// Use 'forwardPorts' to make a list of ports inside the container available locally. // Use 'forwardPorts' to make a list of ports inside the container available locally.

View file

@ -81,3 +81,5 @@ jobs:
push: true push: true
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

4
.gitignore vendored
View file

@ -13,3 +13,7 @@ scraper/x.tsv
*.tmp *.tmp
target/ target/
.env.* .env.*
*/flamegraph.svg
*/perf.data*
scraper-rs/debug/

8
.vscode/launch.json vendored
View file

@ -7,13 +7,13 @@
{ {
"type": "lldb", "type": "lldb",
"request": "launch", "request": "launch",
"name": "warcificator", "name": "scraper-rs",
"cwd": "warcificator/", "cwd": "scraper-rs/",
"cargo": { "cargo": {
// https://github.com/vadimcn/codelldb/issues/884 // https://github.com/vadimcn/codelldb/issues/884
"args": ["build", "--manifest-path=warcificator/Cargo.toml"] "args": ["build", "--manifest-path=scraper-rs/Cargo.toml"]
}, },
"args": ["../data/samples/Carrefour.50.txt"], "args": ["../data/Carrefour.txt"],
"env": {} "env": {}
}, },
{ {

View file

@ -11,7 +11,7 @@ RUN cd sitio && \
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
FROM cgr.dev/chainguard/wolfi-base FROM cgr.dev/chainguard/wolfi-base
RUN apk add --no-cache nodejs npm jq bun RUN apk add --no-cache nodejs npm jq bun sqlite
# Sitio # Sitio
COPY --from=build /usr/src/app/sitio/package.json package.real.json COPY --from=build /usr/src/app/sitio/package.json package.real.json

View file

@ -1,5 +1,6 @@
FROM docker.io/oven/bun:1-alpine AS base FROM cgr.dev/chainguard/wolfi-base AS base
WORKDIR /usr/src/app WORKDIR /usr/src/app
RUN apk add --no-cache bun libgcc
FROM base as build FROM base as build
ENV NODE_ENV=production ENV NODE_ENV=production
@ -7,17 +8,22 @@ COPY . .
RUN bun install --frozen-lockfile RUN bun install --frozen-lockfile
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
# tl is built without its `simd` feature (see scraper-rs/Cargo.toml), so the stable `rust` package from apk is enough
FROM base as rs-build
RUN apk add --no-cache rust build-base sqlite-dev
COPY scraper-rs/ .
RUN cargo install --locked --path .
FROM base FROM base
RUN apk add --no-cache sqlite sqlite-libs
# Scraper # Scraper
COPY --from=build /tmp/cli.build.js /bin/scraper COPY --from=build /tmp/cli.build.js /bin/scraper
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=rs-build /root/.cargo/bin/scraper-rs /usr/local/bin/scraper-rs
ENV NODE_ENV=production ENV NODE_ENV=production
ENV DB_PATH=/db/db.db ENV DB_PATH=/db/db.db
# Cron scraper CMD ["bun", "/bin/scraper", "cron"]
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper
CMD ["busybox", "crond", "-f", "-l2"]

BIN
bun.lockb

Binary file not shown.

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363 oid sha256:f231884c2b9fd0b633746892a00824379b4d8aa110e6348309197b83b0d1c555
size 922185 size 926218

View file

@ -68,7 +68,10 @@ const categorias = [
]; ];
export async function scrapDiaProducts() { export async function scrapDiaProducts() {
await Promise.all([scrapBySite(), scrapBySitemap()]); await Promise.all([
// scrapBySite(),
scrapBySitemap(),
]);
} }
async function scrapBySitemap() { async function scrapBySitemap() {
@ -104,7 +107,7 @@ async function scrapBySite() {
await pMap( await pMap(
links, links,
async (url) => { async (url) => {
const res = await fetch(url); const res = await fetch(url, { timeout: false });
const html = await res.text(); const html = await res.text();
const { document } = parseHTML(html); const { document } = parseHTML(html);

1878
scraper-rs/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

33
scraper-rs/Cargo.toml Normal file
View file

@ -0,0 +1,33 @@
[package]
name = "scraper-rs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
again = "0.1.2"
anyhow = "1.0.79"
async-channel = "2.1.1"
clap = { version = "4.4.15", features = ["derive"] }
nanoid = "0.4.0"
r2d2 = "0.8.10"
r2d2_sqlite = "0.23.0"
rand = "0.8.5"
# lol_html = "1.2.0"
reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
"gzip",
"brotli",
"socks",
] }
rusqlite = "0.30.0"
# scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
simple-error = "0.3.0"
thiserror = "1.0.56"
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1" }
tokio = { version = "1.35.1", features = ["full"] }
tracing = "0.1"
tracing-subscriber = "0.3"

247
scraper-rs/src/main.rs Normal file
View file

@ -0,0 +1,247 @@
use again::RetryPolicy;
use async_channel::Receiver;
use clap::Parser;
use nanoid::nanoid;
use r2d2::Pool;
use r2d2_sqlite::SqliteConnectionManager;
use reqwest::{StatusCode, Url};
use simple_error::{bail, SimpleError};
use std::{
env::{self},
fs,
path::PathBuf,
time::Duration,
};
use thiserror::Error;
// Command-line interface of the scraper: one subcommand per operating mode.
// NOTE: plain `//` comments are used on purpose in these clap-derived types;
// `///` doc comments would be picked up by clap's derive and shown as help text.
#[derive(Parser)] // requires `derive` feature
enum Args {
// Dispatched by `main` to `fetch_list_cli` with the given list path.
FetchList(FetchListArgs),
// Dispatched by `main` to `parse_file_cli` with the given file path.
ParseFile(ParseFileArgs),
}
// Arguments of the `FetchList` subcommand.
#[derive(clap::Args)]
struct FetchListArgs {
// Path of a text file that `fetch_list_cli` reads as one URL per line.
list_path: String,
}
// Arguments of the `ParseFile` subcommand.
#[derive(clap::Args)]
struct ParseFileArgs {
// Path of a saved HTML page that `parse_file_cli` reads and parses.
file_path: String,
}
/// Program entry point: installs the tracing subscriber, parses the command
/// line and runs the selected subcommand to completion.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    tracing_subscriber::fmt::init();

    let args = Args::parse();
    match args {
        Args::ParseFile(parse) => parse_file_cli(parse.file_path).await,
        Args::FetchList(fetch) => fetch_list_cli(fetch.list_path).await,
    }
}
/// Fetches, parses and stores every URL listed in the file at `links_list_path`.
///
/// The file is read as one URL per line; surrounding whitespace is trimmed and
/// blank lines are ignored. URLs are handed to a pool of `worker` tasks through
/// a bounded channel, and the per-worker counters are summed and printed once
/// every worker has finished.
///
/// Environment:
/// * `DB_PATH` - SQLite database file (defaults to `../scraper/sqlite.db`).
/// * `N_COROUTINES` - number of concurrent workers (defaults to 128).
///
/// Returns an error if the list cannot be read, the connection pool cannot be
/// created, the channel is closed early, or a worker task fails to join.
/// Panics if `N_COROUTINES` is set but is not a valid number.
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
    let links_str = fs::read_to_string(links_list_path)?;
    let links = links_str
        .lines()
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(str::to_owned)
        .collect::<Vec<_>>();

    let (sender, receiver) = async_channel::bounded::<String>(1);

    let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
    let manager = SqliteConnectionManager::file(db_path);
    let pool = Pool::new(manager)?;

    let n_coroutines = env::var("N_COROUTINES")
        .map_or(Ok(128), |s| s.parse::<usize>())
        .expect("N_COROUTINES no es un número");
    // Always keep at least one consumer: with zero workers the bounded channel
    // would fill up and the send loop below would never finish.
    let n_workers = n_coroutines.max(1);

    // `0..n` (not `1..n`) so that exactly `n_workers` tasks are spawned.
    let handles = (0..n_workers)
        .map(|_| tokio::spawn(worker(receiver.clone(), pool.clone())))
        .collect::<Vec<_>>();
    // Only the workers should hold receivers: if all of them die, `send`
    // then fails instead of waiting forever.
    drop(receiver);

    for link in links {
        // Async send: the blocking variant must not be used inside async code.
        sender.send(link).await?;
    }
    // Closing lets workers drain what is left and then leave their loops.
    sender.close();

    let mut counters = Counters::default();
    for handle in handles {
        let c = handle.await?;
        counters.success += c.success;
        counters.errored += c.errored;
        counters.skipped += c.skipped;
    }

    println!("Finished: {:?}", counters);
    Ok(())
}
/// Creates an HTTP client from reqwest's default builder settings.
///
/// Panics if the client cannot be constructed.
fn build_client() -> reqwest::Client {
    let builder = reqwest::ClientBuilder::default();
    builder.build().unwrap()
}
/// Tally of how the URLs handled by one `worker` ended up; `fetch_list_cli`
/// sums the tallies of all workers and prints the total.
#[derive(Default, Debug)]
struct Counters {
/// URLs for which `fetch_and_parse` returned a price point.
success: u64,
/// URLs that failed for any reason other than an HTTP 404.
errored: u64,
/// URLs that answered with HTTP 404 (Not Found).
skipped: u64,
}
async fn worker(rx: Receiver<String>, pool: Pool<SqliteConnectionManager>) -> Counters {
let client = build_client();
let mut counters = Counters::default();
while let Ok(url) = rx.recv().await {
let res = fetch_and_parse(&client, url.clone()).await;
match res {
Ok(res) => {
counters.success += 1;
pool.get().unwrap().execute("INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",rusqlite::params![
res.ean,
res.fetched_at,
res.precio_centavos,
res.in_stock,
res.url,
None::<String>,
res.parser_version,
res.name,
res.image_url,
]).unwrap();
}
Err(err) => {
match err.downcast_ref::<FetchError>() {
Some(FetchError::HttpStatus(StatusCode::NOT_FOUND)) => counters.skipped += 1,
_ => counters.errored += 1,
}
tracing::error!(error=%err, url=url);
}
}
}
counters
}
/// Failure modes of fetching and parsing a product page. `worker` downcasts to
/// this type to tell HTTP 404 responses apart from every other error.
#[derive(Debug, Error)]
enum FetchError {
/// Wraps an error reported by reqwest (sending the request or reading the body).
#[error("reqwest error")]
Http(#[from] reqwest::Error),
/// The response arrived with a non-success HTTP status.
#[error("http status: {0}")]
HttpStatus(reqwest::StatusCode),
/// Wraps a `SimpleError`.
#[error("parse error")]
Parse(#[from] SimpleError),
/// Wraps a `tl::ParseError`.
#[error("tl error")]
Tl(#[from] tl::ParseError),
}
#[tracing::instrument(skip(client))]
async fn fetch_and_parse(
client: &reqwest::Client,
url: String,
) -> Result<PrecioPoint, anyhow::Error> {
let policy = RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10)
.with_jitter(true);
let response = policy
.retry(|| {
let request = client.get(url.as_str()).build().unwrap();
client.execute(request)
})
.await
.map_err(FetchError::Http)?;
if !response.status().is_success() {
bail!(FetchError::HttpStatus(response.status()));
}
let body = response.text().await.map_err(FetchError::Http)?;
let maybe_point = { scrap_url(client, url, &body).await };
let point = match maybe_point {
Ok(p) => Ok(p),
Err(err) => {
let debug_path = PathBuf::from("debug/");
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
Ok(point)
}
/// Parses an HTML page stored at `file_path` and prints the result.
///
/// The page's own URL is taken from its `<link rel="canonical">` element and
/// selects the site parser; both the URL and the parse outcome (success or
/// error) are printed to stdout.
///
/// Panics if the page has no canonical link with an `href`.
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
    let html = tokio::fs::read_to_string(file_path).await?;
    let client = build_client();

    let url = {
        let dom = tl::parse(&html, tl::ParserOptions::default())?;
        let parser = dom.parser();
        let canonical = dom
            .query_selector("link[rel=\"canonical\"]")
            .unwrap()
            .find_map(|handle| handle.get(parser).and_then(|node| node.as_tag()));
        let href = canonical
            .and_then(|tag| tag.attributes().get("href").flatten())
            .expect("No meta canonical");
        href.as_utf8_str().to_string()
    };

    println!("URL: {}", &url);
    let outcome = scrap_url(&client, url, &html).await;
    println!("{:?}", outcome);
    Ok(())
}
/// Picks the site-specific parser for `url` based on its host and runs it on
/// the already-downloaded page `body`.
///
/// `client` is only used by the Jumbo scraper, which performs an additional
/// request to look up the EAN.
///
/// Returns an error (instead of panicking) when `url` is not an absolute URL,
/// has no host, belongs to an unknown host, or the page cannot be parsed.
async fn scrap_url(
    client: &reqwest::Client,
    url: String,
    body: &str,
) -> anyhow::Result<PrecioPoint> {
    let url_p = Url::parse(&url)?;
    let host = match url_p.host_str() {
        Some(host) => host,
        None => bail!("URL without host {}", url),
    };
    match host {
        "www.carrefour.com.ar" => {
            sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "diaonline.supermercadosdia.com.ar" => {
            sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "www.cotodigital3.com.ar" => {
            sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
        s => bail!("Unknown host {}", s),
    }
}
use std::time::{SystemTime, UNIX_EPOCH};
mod sites;
/// One price observation produced by a site parser; `worker` stores it as a
/// row of the `precios` table.
#[derive(Debug)]
struct PrecioPoint {
/// Product barcode (EAN) as extracted from the page or the site's API.
ean: String,
// Unix timestamp in seconds; the parsers fill it with `now_sec()`.
fetched_at: u64,
/// Price in centavos (price times 100); `None` when no price was found.
precio_centavos: Option<u64>,
/// Whether the product is available.
in_stock: Option<bool>,
/// URL the observation was scraped from.
url: String,
/// Version number written by the parser that produced this point.
parser_version: u16,
/// Product name, when the page exposes one.
name: Option<String>,
/// Product image URL, when the page exposes one.
image_url: Option<String>,
}
/// Returns the current time as whole seconds since the Unix epoch.
///
/// Panics if the system clock reports a time before the epoch.
fn now_sec() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("Time went backwards")
        .as_secs()
}

View file

@ -0,0 +1,68 @@
use simple_error::bail;
use simple_error::SimpleError;
use crate::sites::common;
use crate::sites::vtex;
use crate::PrecioPoint;
use super::vtex::find_product_ld;
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let precio_centavos = common::price_from_meta(dom)?;
let in_stock = vtex::in_stock_from_meta(dom)?;
let ean = {
let json = &vtex::parse_script_json(dom, "__STATE__")?;
let state = json
.as_object()
.ok_or(SimpleError::new("Seed state not an object"))?;
if state.is_empty() {
bail!("Seed state is an empty object")
}
let (_, product_json) = state
.iter()
.find(|(key, val)| {
key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
})
.ok_or(SimpleError::new("No product in seed state"))?;
let cache_id = product_json
.get("cacheId")
.and_then(|v| v.as_str())
.ok_or(SimpleError::new("No cacheId in seed state"))?;
let (_, product_sku_json) = state
.iter()
.find(|(key, val)| {
key.starts_with(&format!("Product:{}", cache_id))
&& val.get("__typename").is_some_and(|t| t == "SKU")
})
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
product_sku_json
.get("ean")
.and_then(|v| v.as_str())
.ok_or(SimpleError::new("No product SKU in seed state"))?
.to_string()
};
let (name, image_url) = match find_product_ld(dom) {
Some(pm) => {
let p = pm?;
(Some(p.name), Some(p.image))
}
None => match in_stock {
true => bail!("No JSONLD product in in stock product"),
false => (None, None),
},
};
Ok(PrecioPoint {
ean,
fetched_at: crate::now_sec(),
in_stock: Some(in_stock),
name,
image_url,
parser_version: 5,
precio_centavos,
url,
})
}

View file

@ -0,0 +1,19 @@
use std::borrow::Cow;
use tl::VDom;
/// Returns the `content` attribute of the first `<meta property="{prop}">`
/// element in `dom`, or `None` when the element or the attribute is missing.
pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
    let selector = format!("meta[property=\"{}\"]", prop);
    let handle = dom.query_selector(&selector)?.next()?;
    let tag = handle.get(dom.parser())?.as_tag()?;
    let content = tag.attributes().get("content").flatten()?;
    Some(content.as_utf8_str())
}
/// Reads the `product:price:amount` meta tag and converts it to centavos.
///
/// Returns `Ok(None)` when the tag is absent and an error when its content is
/// not a valid decimal number.
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
    let precio_centavos = get_meta_content(dom, "product:price:amount")
        // Round before casting: `as u64` truncates, and values such as
        // 19.99 * 100.0 == 1998.9999999999998 would lose a centavo.
        .map(|s| s.parse::<f64>().map(|f| (f * 100.0).round() as u64))
        .transpose()?;
    Ok(precio_centavos)
}

View file

@ -0,0 +1,77 @@
use anyhow::Context;
use crate::PrecioPoint;
/// Builds a `PrecioPoint` from a Coto product page.
///
/// * EAN: second `span.span_codigoplu` inside the `div#brandText` whose text
///   contains `| EAN: `.
/// * Price: first `.atg_store_newPrice`, with `$` removed, `.` dropped and `,`
///   turned into the decimal point before parsing, then converted to centavos.
/// * Stock: the product is in stock when the page has NO
///   `.product_not_available` element.
/// * Name / image: first `h1.product_page` / `src` of the first `.zoomImage1`.
///
/// Returns an error when the EAN cannot be located or the price text is not a
/// number.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = dom
        .query_selector("div#brandText")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
        .context("No encuentro eanparent")?
        .query_selector(dom.parser(), "span.span_codigoplu")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .nth(1)
        .context("no encuentro el ean")?
        .inner_text(dom.parser())
        .trim()
        .to_string();

    let precio_centavos = dom
        .query_selector(".atg_store_newPrice")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .filter(|s| !s.is_empty())
        .map(|s| {
            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
            let s = s.trim();
            s.parse::<f64>()
        })
        .transpose()
        .context("Parseando precio")?
        // Round before casting: `as u64` truncates, so a product of
        // xx.99 * 100.0 that lands just below the integer would lose a centavo.
        .map(|f| (f * 100.0).round() as u64);

    // The marker element means "not available", so stock is its absence.
    let in_stock = Some(
        dom.query_selector(".product_not_available")
            .unwrap()
            .filter_map(|h| h.get(dom.parser()))
            .filter_map(|n| n.as_tag())
            .next()
            .is_none(),
    );

    let name = dom
        .query_selector("h1.product_page")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .map(|s| s.trim().to_string());

    let image_url = dom
        .query_selector(".zoomImage1")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .and_then(|t| t.attributes().get("src").flatten())
        .map(|s| s.as_utf8_str().to_string());

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}

View file

@ -0,0 +1,41 @@
use anyhow::Context;
use simple_error::bail;
use crate::sites::common;
use crate::PrecioPoint;
use super::vtex::find_product_ld;
use super::vtex::AvailabilityLd;
/// Builds a `PrecioPoint` from a Dia product page.
///
/// The EAN is the `product:retailer_item_id` meta tag, the price the
/// `product:price:amount` meta tag, and name, image and availability come
/// from the Product JSON-LD block (availability from its first offer).
/// Fails when the EAN, the JSON-LD block or its first offer is missing.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = common::get_meta_content(dom, "product:retailer_item_id")
        .context("Parsing EAN")?
        .to_string();
    let precio_centavos = common::price_from_meta(dom)?;

    let product = match find_product_ld(dom) {
        Some(ld) => ld?,
        None => bail!("No JSON/LD"),
    };
    let first_offer = product.offers.offers.first().context("No offer")?;
    let in_stock = first_offer.availability == AvailabilityLd::InStock;

    Ok(PrecioPoint {
        url,
        ean,
        precio_centavos,
        in_stock: Some(in_stock),
        name: Some(product.name),
        image_url: Some(product.image),
        parser_version: 5,
        fetched_at: crate::now_sec(),
    })
}

View file

@ -0,0 +1,92 @@
use std::str::FromStr;
use anyhow::Context;
use reqwest::Url;
use serde::Deserialize;
use simple_error::bail;
use crate::sites::common;
use crate::PrecioPoint;
use super::vtex;
/// One element of the JSON array that `get_ean_from_search` deserializes from
/// the Jumbo catalog search response; only the field read here is declared.
#[derive(Deserialize)]
struct JumboSearch {
/// Items of the search result; each one carries an EAN.
items: Vec<JumboSearchItem>,
}
/// One entry of `JumboSearch::items`.
#[derive(Deserialize)]
struct JumboSearchItem {
/// Barcode reported for this item.
ean: String,
}
/// Looks up the EAN of a Jumbo product by querying the catalog search API
/// with `fq=skuId:<retailer_sku>`.
///
/// Uses the first search result and fails when there is no result, the result
/// has no items, or its items disagree on the EAN.
async fn get_ean_from_search(
    client: &reqwest::Client,
    retailer_sku: String,
) -> anyhow::Result<String> {
    let mut search_url =
        Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search").unwrap();
    search_url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));

    let response = client.get(search_url).send().await?;
    let payload = response.text().await?;

    let results: Vec<JumboSearch> = serde_json::from_str(&payload)?;
    let first_result = results.first().context("No search result")?;
    let ean = first_result
        .items
        .first()
        .context("No search result")?
        .ean
        .clone();

    // Every item of the result is expected to carry the same barcode.
    if first_result.items.iter().any(|item| item.ean != ean) {
        bail!("Inesperado: no todos los items tienen el mismo EAN")
    }
    Ok(ean)
}
/// Builds a `PrecioPoint` from a Jumbo product page.
///
/// Price and availability come from the page's meta tags; name, image and the
/// retailer SKU from the Product JSON-LD block. The EAN is not read from the
/// page: it is resolved with an extra request (`get_ean_from_search`).
pub async fn scrap(
    client: &reqwest::Client,
    url: String,
    body: &str,
) -> Result<PrecioPoint, anyhow::Error> {
    // The DOM lives only inside this block, so it is dropped before the
    // `.await` below.
    let (name, image_url, sku, precio_centavos, in_stock) = {
        let dom = tl::parse(body, tl::ParserOptions::default())?;
        let price = common::price_from_meta(&dom)?;
        let available = vtex::in_stock_from_meta(&dom)?;

        let product = match vtex::find_product_ld(&dom) {
            Some(ld) => ld?,
            None => bail!("No JSON/LD"),
        };
        let retailer_sku = product.sku.context("No retailer SKU in Product LD")?;

        (
            Some(product.name),
            Some(product.image),
            retailer_sku,
            price,
            available,
        )
    };

    let ean = get_ean_from_search(client, sku).await?;

    Ok(PrecioPoint {
        url,
        ean,
        name,
        image_url,
        precio_centavos,
        in_stock: Some(in_stock),
        parser_version: 5,
        fetched_at: crate::now_sec(),
    })
}

View file

@ -0,0 +1,6 @@
pub mod carrefour;
mod common;
pub mod coto;
pub mod dia;
pub mod jumbo;
mod vtex;

View file

@ -0,0 +1,102 @@
use anyhow::{bail, Context};
use serde::Deserialize;
use simple_error::SimpleError;
use tl::VDom;
use super::common;
/// Parses the JSON embedded in
/// `<template data-type="json" data-varname="{varname}"><script>…</script></template>`.
///
/// Fails when no matching template exists, the template has no `<script>`
/// child, or the script content is not valid JSON.
pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
    let parser = dom.parser();

    let template = dom
        .query_selector("template[data-type=\"json\"]")
        .unwrap()
        .filter_map(|handle| handle.get(parser).and_then(|node| node.as_tag()))
        .find(
            |tag| match tag.attributes().get("data-varname").flatten() {
                Some(value) => value.as_utf8_str() == varname,
                None => false,
            },
        )
        .ok_or_else(|| SimpleError::new("Failed to get template tag"))?;

    let script = template
        .query_selector(parser, "script")
        .and_then(|mut matches| matches.next())
        .and_then(|handle| handle.get(parser))
        .ok_or_else(|| SimpleError::new("Failed to get script tag"))?;

    let raw_json = script.inner_html(parser);
    raw_json.parse().context("Couldn't parse JSON in script")
}
/// Lazily yields the parsed content of every
/// `<script type="application/ld+json">` element in `dom`, one
/// `Result` per script so callers decide how to treat invalid JSON.
pub fn get_json_lds<'a>(
    dom: &'a VDom,
) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
    let script_tags = dom
        .query_selector("script[type=\"application/ld+json\"]")
        .unwrap()
        .filter_map(|handle| handle.get(dom.parser()).and_then(|node| node.as_tag()));
    script_tags.map(|tag| serde_json::from_str(&tag.inner_html(dom.parser())))
}
/// Finds the first JSON-LD block whose `@type` equals `typ` and deserializes
/// it into `Ld`. Blocks containing invalid JSON are skipped; `None` means no
/// block of that type exists.
pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
    for candidate in get_json_lds(dom) {
        let Ok(value) = candidate else { continue };
        if value.get("@type").is_some_and(|t| t == typ) {
            return Some(serde_json::from_value(value));
        }
    }
    None
}
/// Finds the page's `Product` JSON-LD block and unwraps it to `ProductLd`.
/// `None` means the page has no such block; `Some(Err(_))` means it exists
/// but does not match the expected shape.
pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
    let found = find_json_ld(dom, "Product")?;
    Some(found.map(|Ld::Product(product)| product))
}
/// A JSON-LD document, discriminated by its `@type` field. Only `Product` is
/// modelled.
#[derive(Deserialize)]
#[serde(tag = "@type")]
pub enum Ld {
Product(ProductLd),
}
/// Fields read from a `Product` JSON-LD document (keys are camelCase in JSON).
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProductLd {
pub name: String,
pub image: String,
/// Optional; the Jumbo scraper requires it to look up the EAN.
pub sku: Option<String>,
pub offers: OffersLd,
}
/// The `offers` object of a product, which itself holds a list of offers.
#[derive(Deserialize)]
pub struct OffersLd {
pub offers: Vec<OfferLd>,
}
/// A single offer inside `OffersLd::offers` (keys are camelCase in JSON).
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct OfferLd {
/// Deserialized only to require that `@type` is `Offer`; never read.
#[serde(rename = "@type")]
_type: OfferTypeLd,
pub price: f64,
pub price_currency: String,
pub availability: AvailabilityLd,
}
/// Accepted values of an offer's `@type`.
#[derive(Deserialize)]
pub enum OfferTypeLd {
Offer,
}
/// Offer availability, deserialized from the schema.org URLs named below.
#[derive(Deserialize, PartialEq)]
pub enum AvailabilityLd {
#[serde(rename = "http://schema.org/InStock")]
InStock,
#[serde(rename = "http://schema.org/OutOfStock")]
OutOfStock,
}
/// Reads the `product:availability` meta tag: `instock` maps to `true` and
/// `oos` to `false`.
///
/// Fails when the tag is missing or holds any other value. The messages do
/// not name a site because this helper is shared by several site parsers.
pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
    let availability = common::get_meta_content(dom, "product:availability")
        .context("No product:availability meta tag")?;
    match availability.as_ref() {
        "oos" => Ok(false),
        "instock" => Ok(true),
        // Include the offending value so the log line is actionable.
        other => bail!("Not a valid product:availability: {}", other),
    }
}

View file

@ -4,7 +4,6 @@ import { join } from "node:path";
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js"; import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { formatDuration, intervalToDuration } from "date-fns"; import { formatDuration, intervalToDuration } from "date-fns";
import { downloadList } from "./scrap.js";
import { db } from "db-datos/db.js"; import { db } from "db-datos/db.js";
import { like } from "drizzle-orm"; import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js"; import { productoUrls } from "db-datos/schema.js";
@ -12,9 +11,10 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js"; import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { readableStreamToText } from "bun";
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 4 }); const scrapQueue = new PQueue({ concurrency: 1 });
export async function auto() { export async function auto() {
const a = new Auto(); const a = new Auto();
@ -38,11 +38,7 @@ class Auto {
this.inform("[auto] Empezando scrap"); this.inform("[auto] Empezando scrap");
} }
async downloadList(supermercado: Supermercado) { async scrapUrls(supermercado: Supermercado) {
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
let listPath: string;
{
const t0 = performance.now(); const t0 = performance.now();
switch (supermercado) { switch (supermercado) {
case "Dia": case "Dia":
@ -63,7 +59,14 @@ class Auto {
); );
} }
listPath = join(ctxPath, `lista-${supermercado}.txt`); async downloadList(supermercado: Supermercado) {
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
await scrapQueue.add(async () => {
await this.scrapUrls(supermercado);
});
const listPath = join(ctxPath, `lista-${supermercado}.txt`);
const host = Object.entries(hosts).find( const host = Object.entries(hosts).find(
([host, supe]) => supe === supermercado ([host, supe]) => supe === supermercado
)![0]; )![0];
@ -82,16 +85,25 @@ class Auto {
async scrapAndInform({ listPath }: { listPath: string }) { async scrapAndInform({ listPath }: { listPath: string }) {
const res = await scrapQueue.add(async () => { const res = await scrapQueue.add(async () => {
const t0 = performance.now(); const t0 = performance.now();
const progress = await downloadList(listPath);
return { took: performance.now() - t0, progress }; const sub = Bun.spawn({
cmd: ["scraper-rs", "fetch-list", listPath],
stdio: ["ignore", "pipe", "inherit"],
});
const text = await readableStreamToText(sub.stdout);
const code = await sub.exited;
if (code !== 0) throw new Error(`scraper-rs threw ${code}`);
return { took: performance.now() - t0, text };
}); });
if (res) { if (res) {
const { took, progress } = res; const { took, text } = res;
this.inform( this.inform(
`Procesado ${listPath} (${progress.done} ok, ${ `Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`
progress.skipped //(${progress.done} ok, ${
} skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})` // progress.skipped
// } skipped, ${progress.errors.length} errores)
); );
} else { } else {
this.inform(`Algo falló en ${listPath}`); this.inform(`Algo falló en ${listPath}`);

View file

@ -4,9 +4,14 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js"; import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { auto } from "./auto.js"; import { auto } from "./auto.js";
import { downloadList, getProduct } from "./scrap.js"; import { downloadList, getProduct } from "./scrap.js";
import Cron from "croner";
if (process.argv[2] === "auto") { if (process.argv[2] === "auto") {
await auto(); await auto();
} else if (process.argv[2] === "cron") {
Cron("0 2 * * *", () => {
auto();
});
} else if (process.argv[2] === "scrap-carrefour-links") { } else if (process.argv[2] === "scrap-carrefour-links") {
await scrapCarrefourProducts(); await scrapCarrefourProducts();
} else if (process.argv[2] === "scrap-dia-links") { } else if (process.argv[2] === "scrap-dia-links") {

View file

@ -13,6 +13,7 @@
"dependencies": { "dependencies": {
"@aws-sdk/client-s3": "^3.478.0", "@aws-sdk/client-s3": "^3.478.0",
"@aws-sdk/lib-storage": "^3.478.0", "@aws-sdk/lib-storage": "^3.478.0",
"croner": "^8.0.0",
"date-fns": "^3.0.6", "date-fns": "^3.0.6",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"drizzle-orm": "^0.29.1", "drizzle-orm": "^0.29.1",

View file

@ -38,7 +38,6 @@
"better-sqlite3": "^9.2.2", "better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"croner": "^8.0.0",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",
"drizzle-orm": "^0.29.1" "drizzle-orm": "^0.29.1"
} }

View file

@ -1,12 +0,0 @@
import { spawn } from "child_process";
import Cron from "croner";
// Only production schedules the scraper: cron pattern "15 3 * * *" fires at
// minute 15 of hour 3 every day.
if (process.env.NODE_ENV === "production") {
  Cron("15 3 * * *", () => runScraper());
}
/**
 * Starts `bun /bin/scraper auto` as a child process that shares this
 * process's stdio. Does not wait for the child to finish.
 */
function runScraper() {
  const child = spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
  // A failed spawn (e.g. `bun` not on PATH) is reported as an "error" event;
  // without a listener that event is thrown and takes the host process down.
  child.on("error", (err) => {
    console.error("[cron] could not start the scraper:", err);
  });
}