Mirror of https://github.com/catdevnull/preciazo.git, synced 2024-11-26 03:26:19 +00:00

Compare commits: 34 commits, 5c9b6d16c8 ... 6853b6389a
Commits in this range (newest first):
6853b6389a, a29b86c8d2, 76cd8f3658, 34bd9aa07e, 7b9533cde6, 6507bd944b, 78e0f3cdee, 66fc8767ef,
a438eec238, eb2f04695c, bf3dbcc019, fa9a010263, 07324f756c, 91397a31d2, 4b211c89af, cfae80cb9a,
972d5ade18, 1348bee6c7, 8e8fe8ddaf, 37ceb15e74, f2401aa965, 3a31586193, 8f6f62a261, b696551949,
27aee01c1a, 348d054b7b, 613efc3111, 78878d8b7e, 1abd98724d, 56a257c389, 2d2912e4e9, abd430421c,
c56272dc30, 3cf723cc3d
24 changed files with 2643 additions and 59 deletions

@@ -9,7 +9,8 @@
     "ghcr.io/devcontainers/features/git-lfs:1": {},
     "ghcr.io/devcontainers/features/node:1": {},
     "ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {},
-    "ghcr.io/devcontainers/features/rust:1": {}
+    "ghcr.io/devcontainers/features/rust:1": {},
+    "ghcr.io/devcontainers/features/docker-in-docker:2": {}
   },

   // Use 'forwardPorts' to make a list of ports inside the container available locally.

.github/workflows/container.yml (vendored, 2 changed lines)
@@ -81,3 +81,5 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max

.gitignore (vendored, 4 changed lines)
@@ -13,3 +13,7 @@ scraper/x.tsv
 *.tmp
 target/
 .env.*
+
+*/flamegraph.svg
+*/perf.data*
+scraper-rs/debug/

.vscode/launch.json (vendored, 8 changed lines)
@@ -7,13 +7,13 @@
     {
       "type": "lldb",
       "request": "launch",
-      "name": "warcificator",
-      "cwd": "warcificator/",
+      "name": "scraper-rs",
+      "cwd": "scraper-rs/",
       "cargo": {
         // https://github.com/vadimcn/codelldb/issues/884
-        "args": ["build", "--manifest-path=warcificator/Cargo.toml"]
+        "args": ["build", "--manifest-path=scraper-rs/Cargo.toml"]
       },
-      "args": ["../data/samples/Carrefour.50.txt"],
+      "args": ["../data/Carrefour.txt"],
       "env": {}
     },
     {

@@ -11,7 +11,7 @@ RUN cd sitio && \
 RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js

 FROM cgr.dev/chainguard/wolfi-base
-RUN apk add --no-cache nodejs npm jq bun
+RUN apk add --no-cache nodejs npm jq bun sqlite

 # Sitio
 COPY --from=build /usr/src/app/sitio/package.json package.real.json

@@ -1,5 +1,6 @@
-FROM docker.io/oven/bun:1-alpine AS base
+FROM cgr.dev/chainguard/wolfi-base AS base
 WORKDIR /usr/src/app
+RUN apk add --no-cache bun libgcc

 FROM base as build
 ENV NODE_ENV=production
@@ -7,17 +8,22 @@ COPY . .
 RUN bun install --frozen-lockfile
 RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js

+# nightly porque usamos tl con `simd` activado
+FROM base as rs-build
+RUN apk add --no-cache rust build-base sqlite-dev
+
+COPY scraper-rs/ .
+RUN cargo install --locked --path .
+
 FROM base
+RUN apk add --no-cache sqlite sqlite-libs
+
 # Scraper
 COPY --from=build /tmp/cli.build.js /bin/scraper
 COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
+COPY --from=rs-build /root/.cargo/bin/scraper-rs /usr/local/bin/scraper-rs

 ENV NODE_ENV=production
 ENV DB_PATH=/db/db.db

-# Cron scraper
-RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
-    && chmod +x /etc/periodic/daily/scraper
-
-CMD ["busybox", "crond", "-f", "-l2"]
+CMD ["bun", "/bin/scraper", "cron"]

bun.lockb (BIN)
Binary file not shown; its Git LFS pointer changed:
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
-size 922185
+oid sha256:f231884c2b9fd0b633746892a00824379b4d8aa110e6348309197b83b0d1c555
+size 926218

@@ -68,7 +68,10 @@ const categorias = [
 ];

 export async function scrapDiaProducts() {
-  await Promise.all([scrapBySite(), scrapBySitemap()]);
+  await Promise.all([
+    // scrapBySite(),
+    scrapBySitemap(),
+  ]);
 }

 async function scrapBySitemap() {
@@ -104,7 +107,7 @@ async function scrapBySite() {
   await pMap(
     links,
     async (url) => {
-      const res = await fetch(url);
+      const res = await fetch(url, { timeout: false });
       const html = await res.text();
       const { document } = parseHTML(html);


scraper-rs/Cargo.lock (generated, new file, 1878 lines)
File diff suppressed because it is too large.

scraper-rs/Cargo.toml (new file, 33 lines)
[package]
name = "scraper-rs"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
again = "0.1.2"
anyhow = "1.0.79"
async-channel = "2.1.1"
clap = { version = "4.4.15", features = ["derive"] }
nanoid = "0.4.0"
r2d2 = "0.8.10"
r2d2_sqlite = "0.23.0"
rand = "0.8.5"
# lol_html = "1.2.0"
reqwest = { version = "0.11.23", default-features = false, features = [
    "rustls-tls",
    "gzip",
    "brotli",
    "socks",
] }
rusqlite = "0.30.0"
# scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
simple-error = "0.3.0"
thiserror = "1.0.56"
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1" }
tokio = { version = "1.35.1", features = ["full"] }
tracing = "0.1"
tracing-subscriber = "0.3"

scraper-rs/src/main.rs (new file, 247 lines)
use again::RetryPolicy;
use async_channel::Receiver;
use clap::Parser;
use nanoid::nanoid;
use r2d2::Pool;
use r2d2_sqlite::SqliteConnectionManager;
use reqwest::{StatusCode, Url};
use simple_error::{bail, SimpleError};
use std::{
    env::{self},
    fs,
    path::PathBuf,
    time::Duration,
};
use thiserror::Error;

#[derive(Parser)] // requires `derive` feature
enum Args {
    FetchList(FetchListArgs),
    ParseFile(ParseFileArgs),
}
#[derive(clap::Args)]
struct FetchListArgs {
    list_path: String,
}
#[derive(clap::Args)]
struct ParseFileArgs {
    file_path: String,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    tracing_subscriber::fmt::init();

    match Args::parse() {
        Args::FetchList(a) => fetch_list_cli(a.list_path).await,
        Args::ParseFile(a) => parse_file_cli(a.file_path).await,
    }
}

async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
    let links_str = fs::read_to_string(links_list_path).unwrap();
    let links = links_str
        .split('\n')
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .map(|s| s.to_owned())
        .collect::<Vec<_>>();

    let (sender, receiver) = async_channel::bounded::<String>(1);

    let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
    let manager = SqliteConnectionManager::file(db_path);
    let pool = Pool::new(manager).unwrap();

    let n_coroutines = env::var("N_COROUTINES")
        .map_or(Ok(128), |s| s.parse::<usize>())
        .expect("N_COROUTINES no es un número");
    let handles = (1..n_coroutines)
        .map(|_| {
            let rx = receiver.clone();
            let pool = pool.clone();
            tokio::spawn(worker(rx, pool))
        })
        .collect::<Vec<_>>();

    for link in links {
        sender.send_blocking(link).unwrap();
    }
    sender.close();

    let mut counters = Counters::default();
    for handle in handles {
        let c = handle.await.unwrap();
        counters.success += c.success;
        counters.errored += c.errored;
        counters.skipped += c.skipped;
    }

    println!("Finished: {:?}", counters);
    Ok(())
}

fn build_client() -> reqwest::Client {
    reqwest::ClientBuilder::default().build().unwrap()
}

#[derive(Default, Debug)]
struct Counters {
    success: u64,
    errored: u64,
    skipped: u64,
}

async fn worker(rx: Receiver<String>, pool: Pool<SqliteConnectionManager>) -> Counters {
    let client = build_client();

    let mut counters = Counters::default();
    while let Ok(url) = rx.recv().await {
        let res = fetch_and_parse(&client, url.clone()).await;
        match res {
            Ok(res) => {
                counters.success += 1;
                pool.get().unwrap().execute("INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",rusqlite::params![
                    res.ean,
                    res.fetched_at,
                    res.precio_centavos,
                    res.in_stock,
                    res.url,
                    None::<String>,
                    res.parser_version,
                    res.name,
                    res.image_url,
                ]).unwrap();
            }
            Err(err) => {
                match err.downcast_ref::<FetchError>() {
                    Some(FetchError::HttpStatus(StatusCode::NOT_FOUND)) => counters.skipped += 1,
                    _ => counters.errored += 1,
                }

                tracing::error!(error=%err, url=url);
            }
        }
    }

    counters
}

#[derive(Debug, Error)]
enum FetchError {
    #[error("reqwest error")]
    Http(#[from] reqwest::Error),
    #[error("http status: {0}")]
    HttpStatus(reqwest::StatusCode),
    #[error("parse error")]
    Parse(#[from] SimpleError),
    #[error("tl error")]
    Tl(#[from] tl::ParseError),
}

#[tracing::instrument(skip(client))]
async fn fetch_and_parse(
    client: &reqwest::Client,
    url: String,
) -> Result<PrecioPoint, anyhow::Error> {
    let policy = RetryPolicy::exponential(Duration::from_millis(300))
        .with_max_retries(10)
        .with_jitter(true);

    let response = policy
        .retry(|| {
            let request = client.get(url.as_str()).build().unwrap();
            client.execute(request)
        })
        .await
        .map_err(FetchError::Http)?;
    if !response.status().is_success() {
        bail!(FetchError::HttpStatus(response.status()));
    }
    let body = response.text().await.map_err(FetchError::Http)?;

    let maybe_point = { scrap_url(client, url, &body).await };

    let point = match maybe_point {
        Ok(p) => Ok(p),
        Err(err) => {
            let debug_path = PathBuf::from("debug/");
            tokio::fs::create_dir_all(&debug_path).await.unwrap();
            let file_path = debug_path.join(format!("{}.html", nanoid!()));
            tokio::fs::write(&file_path, &body).await.unwrap();
            tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
            Err(err)
        }
    }?;

    Ok(point)
}

async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
    let file = tokio::fs::read_to_string(file_path).await?;

    let client = build_client();

    let url = {
        let dom = tl::parse(&file, tl::ParserOptions::default())?;
        dom.query_selector("link[rel=\"canonical\"]")
            .unwrap()
            .filter_map(|h| h.get(dom.parser()))
            .filter_map(|n| n.as_tag())
            .next()
            .and_then(|t| t.attributes().get("href").flatten())
            .expect("No meta canonical")
            .as_utf8_str()
            .to_string()
    };

    println!("URL: {}", &url);
    println!("{:?}", scrap_url(&client, url, &file).await);
    Ok(())
}

async fn scrap_url(
    client: &reqwest::Client,
    url: String,
    body: &str,
) -> anyhow::Result<PrecioPoint> {
    let url_p = Url::parse(&url).unwrap();
    match url_p.host_str().unwrap() {
        "www.carrefour.com.ar" => {
            sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
        }
        "diaonline.supermercadosdia.com.ar" => {
            sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
        }
        "www.cotodigital3.com.ar" => {
            sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
        }
        "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
        s => bail!("Unknown host {}", s),
    }
}

use std::time::{SystemTime, UNIX_EPOCH};

mod sites;

#[derive(Debug)]
struct PrecioPoint {
    ean: String,
    // unix
    fetched_at: u64,
    precio_centavos: Option<u64>,
    in_stock: Option<bool>,
    url: String,
    parser_version: u16,
    name: Option<String>,
    image_url: Option<String>,
}

fn now_sec() -> u64 {
    let start = SystemTime::now();
    let since_the_epoch = start
        .duration_since(UNIX_EPOCH)
        .expect("Time went backwards");
    since_the_epoch.as_secs()
}
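
Note (illustrative, not part of the diff): the worker above counts plain 404s as skipped rather than errored by downcasting the anyhow error back to FetchError. A minimal sketch of that classification, which would compile if appended to main.rs:

// Same downcast worker() performs on fetch_and_parse() failures.
fn classify(err: &anyhow::Error) -> &'static str {
    use reqwest::StatusCode;
    match err.downcast_ref::<FetchError>() {
        Some(FetchError::HttpStatus(StatusCode::NOT_FOUND)) => "skipped",
        _ => "errored",
    }
}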

scraper-rs/src/sites/carrefour.rs (new file, 68 lines)
use simple_error::bail;
use simple_error::SimpleError;

use crate::sites::common;
use crate::sites::vtex;
use crate::PrecioPoint;

use super::vtex::find_product_ld;

pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let precio_centavos = common::price_from_meta(dom)?;

    let in_stock = vtex::in_stock_from_meta(dom)?;

    let ean = {
        let json = &vtex::parse_script_json(dom, "__STATE__")?;
        let state = json
            .as_object()
            .ok_or(SimpleError::new("Seed state not an object"))?;
        if state.is_empty() {
            bail!("Seed state is an empty object")
        }
        let (_, product_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
            })
            .ok_or(SimpleError::new("No product in seed state"))?;
        let cache_id = product_json
            .get("cacheId")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No cacheId in seed state"))?;
        let (_, product_sku_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with(&format!("Product:{}", cache_id))
                    && val.get("__typename").is_some_and(|t| t == "SKU")
            })
            .ok_or(SimpleError::new("No Product:cacheId* found"))?;
        product_sku_json
            .get("ean")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No product SKU in seed state"))?
            .to_string()
    };

    let (name, image_url) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (Some(p.name), Some(p.image))
        }
        None => match in_stock {
            true => bail!("No JSONLD product in in stock product"),
            false => (None, None),
        },
    };

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}

scraper-rs/src/sites/common.rs (new file, 19 lines)
use std::borrow::Cow;

use tl::VDom;

pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
    dom.query_selector(&format!("meta[property=\"{}\"]", prop))
        .and_then(|mut iter| iter.next())
        .and_then(|h| h.get(dom.parser()))
        .and_then(|n| n.as_tag())
        .and_then(|tag| tag.attributes().get("content").flatten())
        .map(|s| s.as_utf8_str())
}

pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
    let precio_centavos = get_meta_content(dom, "product:price:amount")
        .map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
        .transpose()?;
    Ok(precio_centavos)
}
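
Illustrative only (not in the diff): price_from_meta reads the product:price:amount meta value as a float amount in pesos and stores integer centavos. A minimal test sketch with a made-up HTML snippet, assuming it sits next to the code above:

#[cfg(test)]
mod tests {
    use super::price_from_meta;

    #[test]
    fn price_meta_to_centavos() {
        // Hypothetical snippet: 10.50 pesos -> 1050 centavos.
        let html = r#"<meta property="product:price:amount" content="10.50">"#;
        let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
        assert_eq!(price_from_meta(&dom).unwrap(), Some(1050));
    }
}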

scraper-rs/src/sites/coto.rs (new file, 77 lines)
use anyhow::Context;

use crate::PrecioPoint;

pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = dom
        .query_selector("div#brandText")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
        .context("No encuentro eanparent")?
        .query_selector(dom.parser(), "span.span_codigoplu")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .nth(1)
        .context("no encuentro el ean")?
        .inner_text(dom.parser())
        .trim()
        .to_string();

    let precio_centavos = dom
        .query_selector(".atg_store_newPrice")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .filter(|s| !s.is_empty())
        .map(|s| {
            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
            let s = s.trim();
            s.parse::<f64>()
        })
        .transpose()
        .context("Parseando precio")?
        .map(|f| (f * 100.0) as u64);

    let in_stock = Some(
        dom.query_selector(".product_not_available")
            .unwrap()
            .filter_map(|h| h.get(dom.parser()))
            .filter_map(|n| n.as_tag())
            .next()
            .is_some(),
    );

    let name = dom
        .query_selector("h1.product_page")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(dom.parser()))
        .map(|s| s.trim().to_string());

    let image_url = dom
        .query_selector(".zoomImage1")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .next()
        .and_then(|t| t.attributes().get("src").flatten())
        .map(|s| s.as_utf8_str().to_string());

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}
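
As an aside (not in the diff), the price handling above strips the currency sign and the thousands separators and swaps the decimal comma before parsing. A standalone sketch of that pipeline with a made-up price:

fn main() {
    // "$1.000,50" -> "1000,50" -> "1000.50" -> 100050 centavos.
    let s = "$1.000,50".replacen('$', "", 1).replace('.', "").replace(',', ".");
    let centavos = (s.trim().parse::<f64>().unwrap() * 100.0) as u64;
    assert_eq!(centavos, 100050);
}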

scraper-rs/src/sites/dia.rs (new file, 41 lines)
use anyhow::Context;
use simple_error::bail;

use crate::sites::common;
use crate::PrecioPoint;

use super::vtex::find_product_ld;
use super::vtex::AvailabilityLd;

pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = common::get_meta_content(dom, "product:retailer_item_id")
        .context("Parsing EAN")?
        .to_string();
    let precio_centavos = common::price_from_meta(dom)?;

    let (name, image_url, in_stock) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (
                Some(p.name),
                Some(p.image),
                Some(
                    p.offers.offers.first().context("No offer")?.availability
                        == AvailabilityLd::InStock,
                ),
            )
        }
        None => bail!("No JSON/LD"),
    };

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}

scraper-rs/src/sites/jumbo.rs (new file, 92 lines)
use std::str::FromStr;

use anyhow::Context;
use reqwest::Url;
use serde::Deserialize;
use simple_error::bail;

use crate::sites::common;
use crate::PrecioPoint;

use super::vtex;

#[derive(Deserialize)]
struct JumboSearch {
    items: Vec<JumboSearchItem>,
}
#[derive(Deserialize)]
struct JumboSearchItem {
    ean: String,
}

async fn get_ean_from_search(
    client: &reqwest::Client,
    retailer_sku: String,
) -> anyhow::Result<String> {
    let s = client
        .get({
            let mut url =
                Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search")
                    .unwrap();
            url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
            url
        })
        .send()
        .await?
        .text()
        .await?;
    let ean = {
        let search: Vec<JumboSearch> = serde_json::from_str(&s)?;
        let result = search.first().context("No search result")?;
        let ean = result
            .items
            .first()
            .context("No search result")?
            .ean
            .clone();
        if !result.items.iter().all(|i| i.ean == ean) {
            bail!("Inesperado: no todos los items tienen el mismo EAN")
        }
        ean
    };
    Ok(ean)
}

pub async fn scrap(
    client: &reqwest::Client,
    url: String,
    body: &str,
) -> Result<PrecioPoint, anyhow::Error> {
    let (name, image_url, sku, precio_centavos, in_stock) = {
        let dom = tl::parse(body, tl::ParserOptions::default())?;
        let precio_centavos = common::price_from_meta(&dom)?;
        let in_stock = vtex::in_stock_from_meta(&dom)?;

        match vtex::find_product_ld(&dom) {
            Some(pm) => {
                let p = pm?;
                (
                    Some(p.name),
                    Some(p.image),
                    p.sku.context("No retailer SKU in Product LD")?,
                    precio_centavos,
                    in_stock,
                )
            }
            None => bail!("No JSON/LD"),
        }
    };

    let ean = get_ean_from_search(client, sku).await?;

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}
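
Illustrative only (not in the diff): the minimal response shape get_ean_from_search expects back from the VTEX search endpoint, as a test sketch with a made-up EAN; real responses carry many more fields, which serde ignores:

#[cfg(test)]
mod tests {
    use super::JumboSearch;

    #[test]
    fn deserializes_search_response() {
        // Only the fields declared on JumboSearch / JumboSearchItem matter here.
        let body = r#"[{ "items": [{ "ean": "7790070410120" }] }]"#;
        let search: Vec<JumboSearch> = serde_json::from_str(body).unwrap();
        assert_eq!(search[0].items[0].ean, "7790070410120");
    }
}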

scraper-rs/src/sites/mod.rs (new file, 6 lines)
pub mod carrefour;
mod common;
pub mod coto;
pub mod dia;
pub mod jumbo;
mod vtex;

scraper-rs/src/sites/vtex.rs (new file, 102 lines)
use anyhow::{bail, Context};
use serde::Deserialize;
use simple_error::SimpleError;
use tl::VDom;

use super::common;

pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
    let inner_html = &dom
        .query_selector("template[data-type=\"json\"]")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()).and_then(|n| n.as_tag()))
        .find(|t| {
            t.attributes()
                .get("data-varname")
                .flatten()
                .map_or(false, |v| v.as_utf8_str() == varname)
        })
        .ok_or(SimpleError::new("Failed to get template tag"))?
        .query_selector(dom.parser(), "script")
        .and_then(|mut it| it.next())
        .and_then(|h| h.get(dom.parser()))
        .ok_or(SimpleError::new("Failed to get script tag"))?
        .inner_html(dom.parser());
    inner_html.parse().context("Couldn't parse JSON in script")
}

pub fn get_json_lds<'a>(
    dom: &'a VDom,
) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
    dom.query_selector("script[type=\"application/ld+json\"]")
        .unwrap()
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .map(|t| serde_json::from_str(&t.inner_html(dom.parser())))
}
pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
    get_json_lds(dom)
        .filter_map(|v| v.ok())
        .find(|v| v.get("@type").is_some_and(|t| t == typ))
        .map(serde_json::from_value)
}
pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
    find_json_ld(dom, "Product").map(|l| {
        l.map(|l| match l {
            Ld::Product(p) => p,
        })
    })
}

#[derive(Deserialize)]
#[serde(tag = "@type")]
pub enum Ld {
    Product(ProductLd),
}

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProductLd {
    pub name: String,
    pub image: String,
    pub sku: Option<String>,
    pub offers: OffersLd,
}
#[derive(Deserialize)]
pub struct OffersLd {
    pub offers: Vec<OfferLd>,
}

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct OfferLd {
    #[serde(rename = "@type")]
    _type: OfferTypeLd,
    pub price: f64,
    pub price_currency: String,
    pub availability: AvailabilityLd,
}
#[derive(Deserialize)]
pub enum OfferTypeLd {
    Offer,
}
#[derive(Deserialize, PartialEq)]
pub enum AvailabilityLd {
    #[serde(rename = "http://schema.org/InStock")]
    InStock,
    #[serde(rename = "http://schema.org/OutOfStock")]
    OutOfStock,
}

pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
    Ok(
        match common::get_meta_content(dom, "product:availability") {
            Some(s) => match s.as_ref() {
                "oos" => false,
                "instock" => true,
                _ => bail!("Not a valid product:availability"),
            },
            None => bail!("No product:availability in carrefour"),
        },
    )
}
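
Illustrative only (not in the diff): the minimal JSON-LD shape the serde structs above accept, with made-up values; the "@type" tags match the serde attributes, and unknown fields from real pages are ignored:

#[cfg(test)]
mod tests {
    use super::{AvailabilityLd, Ld};

    #[test]
    fn deserializes_product_json_ld() {
        // Hypothetical product; only the declared fields are required.
        let json = r#"{
            "@type": "Product",
            "name": "Yerba Mate 500g",
            "image": "https://example.com/yerba.jpg",
            "sku": "123456",
            "offers": { "offers": [{
                "@type": "Offer",
                "price": 1050.0,
                "priceCurrency": "ARS",
                "availability": "http://schema.org/InStock"
            }] }
        }"#;
        let p = match serde_json::from_str::<Ld>(json).unwrap() {
            Ld::Product(p) => p,
        };
        assert_eq!(p.name, "Yerba Mate 500g");
        assert!(p.offers.offers[0].availability == AvailabilityLd::InStock);
    }
}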

@@ -4,7 +4,6 @@ import { join } from "node:path";
 import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
 import PQueue from "p-queue";
 import { formatDuration, intervalToDuration } from "date-fns";
-import { downloadList } from "./scrap.js";
 import { db } from "db-datos/db.js";
 import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
@@ -12,9 +11,10 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapCotoProducts } from "../link-scrapers/coto.js";
 import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
+import { readableStreamToText } from "bun";

 // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
-const scrapQueue = new PQueue({ concurrency: 4 });
+const scrapQueue = new PQueue({ concurrency: 1 });

 export async function auto() {
   const a = new Auto();
@@ -38,32 +38,35 @@
     this.inform("[auto] Empezando scrap");
   }

+  async scrapUrls(supermercado: Supermercado) {
+    const t0 = performance.now();
+    switch (supermercado) {
+      case "Dia":
+        await scrapDiaProducts();
+        break;
+      case "Coto":
+        await scrapCotoProducts();
+        break;
+      case "Carrefour":
+        await scrapCarrefourProducts();
+        break;
+      case "Jumbo":
+        await scrapJumboProducts();
+        break;
+    }
+    this.inform(
+      `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
+    );
+  }
+
   async downloadList(supermercado: Supermercado) {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));

-    let listPath: string;
-    {
-      const t0 = performance.now();
-      switch (supermercado) {
-        case "Dia":
-          await scrapDiaProducts();
-          break;
-        case "Coto":
-          await scrapCotoProducts();
-          break;
-        case "Carrefour":
-          await scrapCarrefourProducts();
-          break;
-        case "Jumbo":
-          await scrapJumboProducts();
-          break;
-      }
-      this.inform(
-        `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
-      );
-    }
-
-    listPath = join(ctxPath, `lista-${supermercado}.txt`);
+    await scrapQueue.add(async () => {
+      await this.scrapUrls(supermercado);
+    });
+
+    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
     const host = Object.entries(hosts).find(
       ([host, supe]) => supe === supermercado
     )![0];
@@ -82,16 +85,25 @@
   async scrapAndInform({ listPath }: { listPath: string }) {
     const res = await scrapQueue.add(async () => {
       const t0 = performance.now();
-      const progress = await downloadList(listPath);
-      return { took: performance.now() - t0, progress };
+      const sub = Bun.spawn({
+        cmd: ["scraper-rs", "fetch-list", listPath],
+        stdio: ["ignore", "pipe", "inherit"],
+      });
+      const text = await readableStreamToText(sub.stdout);
+      const code = await sub.exited;
+      if (code !== 0) throw new Error(`scraper-rs threw ${code}`);
+
+      return { took: performance.now() - t0, text };
     });

     if (res) {
-      const { took, progress } = res;
+      const { took, text } = res;
       this.inform(
-        `Procesado ${listPath} (${progress.done} ok, ${
-          progress.skipped
-        } skipped, ${progress.errors.length} errores) (tardó ${formatMs(took)})`
+        `Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`
+        //(${progress.done} ok, ${
+        // progress.skipped
+        // } skipped, ${progress.errors.length} errores)
       );
     } else {
       this.inform(`Algo falló en ${listPath}`);

@@ -4,9 +4,14 @@ import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
 import { auto } from "./auto.js";
 import { downloadList, getProduct } from "./scrap.js";
+import Cron from "croner";

 if (process.argv[2] === "auto") {
   await auto();
+} else if (process.argv[2] === "cron") {
+  Cron("0 2 * * *", () => {
+    auto();
+  });
 } else if (process.argv[2] === "scrap-carrefour-links") {
   await scrapCarrefourProducts();
 } else if (process.argv[2] === "scrap-dia-links") {

@@ -13,6 +13,7 @@
   "dependencies": {
     "@aws-sdk/client-s3": "^3.478.0",
     "@aws-sdk/lib-storage": "^3.478.0",
+    "croner": "^8.0.0",
     "date-fns": "^3.0.6",
     "db-datos": "workspace:^",
     "drizzle-orm": "^0.29.1",

@@ -38,7 +38,6 @@
     "better-sqlite3": "^9.2.2",
     "chart.js": "^4.4.1",
     "chartjs-adapter-dayjs-4": "^1.0.4",
-    "croner": "^8.0.0",
     "dayjs": "^1.11.10",
     "drizzle-orm": "^0.29.1"
   }

@@ -1,12 +0,0 @@
-import { spawn } from "child_process";
-import Cron from "croner";
-
-if (process.env.NODE_ENV === "production") {
-  const job = Cron("15 3 * * *", () => {
-    runScraper();
-  });
-}
-
-function runScraper() {
-  spawn("bun", ["/bin/scraper", "auto"], { stdio: "inherit" });
-}