mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
rustificar todo
This commit is contained in:
parent
d233dbd259
commit
0144a56158
3 changed files with 216 additions and 41 deletions
|
@ -1,14 +1,7 @@
|
||||||
FROM cgr.dev/chainguard/wolfi-base AS base
|
FROM cgr.dev/chainguard/wolfi-base AS base
|
||||||
WORKDIR /usr/src/app
|
WORKDIR /usr/src/app
|
||||||
RUN apk add --no-cache bun libgcc
|
RUN apk add --no-cache libgcc
|
||||||
|
|
||||||
FROM base as build
|
|
||||||
ENV NODE_ENV=production
|
|
||||||
COPY . .
|
|
||||||
RUN bun install --frozen-lockfile
|
|
||||||
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
|
|
||||||
|
|
||||||
# nightly porque usamos tl con `simd` activado
|
|
||||||
FROM base as rs-build
|
FROM base as rs-build
|
||||||
RUN apk add --no-cache rust build-base sqlite-dev
|
RUN apk add --no-cache rust build-base sqlite-dev
|
||||||
|
|
||||||
|
@ -19,11 +12,8 @@ FROM base
|
||||||
RUN apk add --no-cache sqlite sqlite-libs
|
RUN apk add --no-cache sqlite sqlite-libs
|
||||||
|
|
||||||
# Scraper
|
# Scraper
|
||||||
COPY --from=build /tmp/cli.build.js /bin/scraper
|
|
||||||
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
|
|
||||||
COPY --from=rs-build /root/.cargo/bin/scraper-rs /usr/local/bin/scraper-rs
|
COPY --from=rs-build /root/.cargo/bin/scraper-rs /usr/local/bin/scraper-rs
|
||||||
|
|
||||||
ENV NODE_ENV=production
|
|
||||||
ENV DB_PATH=/db/db.db
|
ENV DB_PATH=/db/db.db
|
||||||
|
|
||||||
CMD ["bun", "/bin/scraper", "cron"]
|
CMD ["scraper-rs", "cron"]
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
use again::RetryPolicy;
|
use again::RetryPolicy;
|
||||||
use async_channel::Receiver;
|
use async_channel::Receiver;
|
||||||
use clap::{Parser, ValueEnum};
|
use clap::{Parser, ValueEnum};
|
||||||
|
use futures::{stream, StreamExt, TryStreamExt};
|
||||||
use nanoid::nanoid;
|
use nanoid::nanoid;
|
||||||
use r2d2::Pool;
|
use r2d2::Pool;
|
||||||
use r2d2_sqlite::SqliteConnectionManager;
|
use r2d2_sqlite::SqliteConnectionManager;
|
||||||
|
@ -13,8 +14,9 @@ use std::{
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
use tokio::time;
|
||||||
|
|
||||||
#[derive(ValueEnum, Clone)]
|
#[derive(ValueEnum, Clone, Debug)]
|
||||||
enum Supermercado {
|
enum Supermercado {
|
||||||
Dia,
|
Dia,
|
||||||
Jumbo,
|
Jumbo,
|
||||||
|
@ -27,6 +29,8 @@ enum Args {
|
||||||
FetchList(FetchListArgs),
|
FetchList(FetchListArgs),
|
||||||
ParseFile(ParseFileArgs),
|
ParseFile(ParseFileArgs),
|
||||||
GetUrlList(GetUrlListArgs),
|
GetUrlList(GetUrlListArgs),
|
||||||
|
Auto(AutoArgs),
|
||||||
|
Cron(AutoArgs),
|
||||||
}
|
}
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
struct FetchListArgs {
|
struct FetchListArgs {
|
||||||
|
@ -41,6 +45,8 @@ struct GetUrlListArgs {
|
||||||
#[arg(value_enum)]
|
#[arg(value_enum)]
|
||||||
supermercado: Supermercado,
|
supermercado: Supermercado,
|
||||||
}
|
}
|
||||||
|
#[derive(clap::Args)]
|
||||||
|
struct AutoArgs {}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
@ -50,6 +56,8 @@ async fn main() -> anyhow::Result<()> {
|
||||||
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||||
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||||
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
||||||
|
Args::Auto(_) => auto_cli().await,
|
||||||
|
Args::Cron(_) => cron_cli().await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,14 +70,18 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||||
.map(|s| s.to_owned())
|
.map(|s| s.to_owned())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let pool = connect_db();
|
||||||
|
let counters = fetch_list(&pool, links).await;
|
||||||
|
|
||||||
|
println!("Finished: {:?}", counters);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_list(pool: &Pool<SqliteConnectionManager>, links: Vec<String>) -> Counters {
|
||||||
let (sender, receiver) = async_channel::bounded::<String>(1);
|
let (sender, receiver) = async_channel::bounded::<String>(1);
|
||||||
|
|
||||||
let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
|
|
||||||
let manager = SqliteConnectionManager::file(db_path);
|
|
||||||
let pool = Pool::new(manager).unwrap();
|
|
||||||
|
|
||||||
let n_coroutines = env::var("N_COROUTINES")
|
let n_coroutines = env::var("N_COROUTINES")
|
||||||
.map_or(Ok(128), |s| s.parse::<usize>())
|
.map_or(Ok(24), |s| s.parse::<usize>())
|
||||||
.expect("N_COROUTINES no es un número");
|
.expect("N_COROUTINES no es un número");
|
||||||
let handles = (1..n_coroutines)
|
let handles = (1..n_coroutines)
|
||||||
.map(|_| {
|
.map(|_| {
|
||||||
|
@ -91,9 +103,14 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||||
counters.errored += c.errored;
|
counters.errored += c.errored;
|
||||||
counters.skipped += c.skipped;
|
counters.skipped += c.skipped;
|
||||||
}
|
}
|
||||||
|
counters
|
||||||
|
}
|
||||||
|
|
||||||
println!("Finished: {:?}", counters);
|
fn connect_db() -> Pool<SqliteConnectionManager> {
|
||||||
Ok(())
|
let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
|
||||||
|
let manager = SqliteConnectionManager::file(db_path);
|
||||||
|
let pool = Pool::new(manager).unwrap();
|
||||||
|
pool
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_client() -> reqwest::Client {
|
fn build_client() -> reqwest::Client {
|
||||||
|
@ -130,7 +147,10 @@ async fn worker(rx: Receiver<String>, pool: Pool<SqliteConnectionManager>) -> Co
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
match err.downcast_ref::<FetchError>() {
|
match err.downcast_ref::<FetchError>() {
|
||||||
Some(FetchError::HttpStatus(StatusCode::NOT_FOUND)) => counters.skipped += 1,
|
Some(FetchError::Http(e)) => match e.status() {
|
||||||
|
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||||
|
_ => counters.errored += 1,
|
||||||
|
},
|
||||||
_ => counters.errored += 1,
|
_ => counters.errored += 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -146,20 +166,15 @@ async fn worker(rx: Receiver<String>, pool: Pool<SqliteConnectionManager>) -> Co
|
||||||
enum FetchError {
|
enum FetchError {
|
||||||
#[error("reqwest error")]
|
#[error("reqwest error")]
|
||||||
Http(#[from] reqwest::Error),
|
Http(#[from] reqwest::Error),
|
||||||
#[error("http status: {0}")]
|
|
||||||
HttpStatus(reqwest::StatusCode),
|
|
||||||
#[error("parse error")]
|
#[error("parse error")]
|
||||||
Parse(#[from] SimpleError),
|
Parse(#[from] SimpleError),
|
||||||
#[error("tl error")]
|
#[error("tl error")]
|
||||||
Tl(#[from] tl::ParseError),
|
Tl(#[from] tl::ParseError),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn do_request(client: &reqwest::Client, url: &str) -> anyhow::Result<reqwest::Response> {
|
pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
|
||||||
let request = client.get(url).build()?;
|
let request = client.get(url).build()?;
|
||||||
let response = client.execute(request).await?;
|
let response = client.execute(request).await?.error_for_status()?;
|
||||||
if !response.status().is_success() {
|
|
||||||
bail!(FetchError::HttpStatus(response.status()));
|
|
||||||
}
|
|
||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -221,12 +236,7 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
|
async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
|
||||||
let urls = match supermercado {
|
let urls = get_urls(&supermercado).await?;
|
||||||
Supermercado::Dia => sites::dia::get_urls().await?,
|
|
||||||
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
|
||||||
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
|
||||||
_ => todo!(),
|
|
||||||
};
|
|
||||||
urls.iter().for_each(|s| {
|
urls.iter().for_each(|s| {
|
||||||
println!("{}", s);
|
println!("{}", s);
|
||||||
});
|
});
|
||||||
|
@ -234,6 +244,15 @@ async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn get_urls(supermercado: &Supermercado) -> Result<Vec<String>, anyhow::Error> {
|
||||||
|
Ok(match supermercado {
|
||||||
|
Supermercado::Dia => sites::dia::get_urls().await?,
|
||||||
|
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
||||||
|
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
||||||
|
Supermercado::Coto => sites::coto::get_urls().await?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
async fn scrap_url(
|
async fn scrap_url(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
url: String,
|
url: String,
|
||||||
|
@ -255,6 +274,105 @@ async fn scrap_url(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct Auto {
|
||||||
|
pool: Pool<SqliteConnectionManager>,
|
||||||
|
telegram_token: String,
|
||||||
|
telegram_chat_id: String,
|
||||||
|
}
|
||||||
|
impl Auto {
|
||||||
|
async fn download_supermercado(self: &Self, supermercado: Supermercado) -> anyhow::Result<()> {
|
||||||
|
{
|
||||||
|
let t0 = now_sec();
|
||||||
|
self.get_and_save_urls(&supermercado).await?;
|
||||||
|
self.inform(&format!(
|
||||||
|
"Downloaded url list {:?} (took {})",
|
||||||
|
&supermercado,
|
||||||
|
now_sec() - t0
|
||||||
|
))
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
let links: Vec<String> = self
|
||||||
|
.pool
|
||||||
|
.get()?
|
||||||
|
.prepare(r#"SELECT url FROM producto_urls;"#)?
|
||||||
|
.query_map([], |r| r.get::<_, String>(0))?
|
||||||
|
.map(|r| r.unwrap())
|
||||||
|
.collect();
|
||||||
|
{
|
||||||
|
let t0 = now_sec();
|
||||||
|
let counters = fetch_list(&self.pool, links).await;
|
||||||
|
self.inform(&format!(
|
||||||
|
"Downloaded {:?}: {:?} (took {})",
|
||||||
|
&supermercado,
|
||||||
|
counters,
|
||||||
|
now_sec() - t0
|
||||||
|
))
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
|
||||||
|
let urls = get_urls(supermercado).await?;
|
||||||
|
let connection = &mut self.pool.get()?;
|
||||||
|
let tx = connection.transaction()?;
|
||||||
|
{
|
||||||
|
let mut stmt = tx.prepare(
|
||||||
|
r#"INSERT INTO producto_urls(url, first_seen, last_seen)
|
||||||
|
VALUES (?1, ?2, ?2)
|
||||||
|
ON CONFLICT(url) DO UPDATE SET last_seen=?2;"#,
|
||||||
|
)?;
|
||||||
|
let now: u64 = now_ms().try_into()?;
|
||||||
|
for url in urls {
|
||||||
|
stmt.execute(rusqlite::params![url, now])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tx.commit()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn inform(self: &Self, msg: &str) {
|
||||||
|
println!("{}", msg);
|
||||||
|
let u = Url::parse_with_params(
|
||||||
|
&format!(
|
||||||
|
"https://api.telegram.org/bot{}/sendMessage",
|
||||||
|
self.telegram_token
|
||||||
|
),
|
||||||
|
&[
|
||||||
|
("chat_id", self.telegram_chat_id.clone()),
|
||||||
|
("text", msg.to_string()),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
reqwest::get(u).await.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn auto_cli() -> anyhow::Result<()> {
|
||||||
|
let db = connect_db();
|
||||||
|
let auto = Auto {
|
||||||
|
pool: db,
|
||||||
|
telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
|
||||||
|
telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
|
||||||
|
};
|
||||||
|
auto.inform("[auto] Empezando scrap").await;
|
||||||
|
stream::iter(Supermercado::value_variants().iter())
|
||||||
|
.map(|s| auto.download_supermercado(s.to_owned()))
|
||||||
|
.buffer_unordered(64)
|
||||||
|
.try_collect()
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
async fn cron_cli() -> anyhow::Result<()> {
|
||||||
|
let mut interval = time::interval(std::time::Duration::from_secs(60 * 60 * 24));
|
||||||
|
|
||||||
|
loop {
|
||||||
|
interval.tick().await;
|
||||||
|
auto_cli().await.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
use std::time::{SystemTime, UNIX_EPOCH};
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
mod sites;
|
mod sites;
|
||||||
|
@ -273,9 +391,14 @@ struct PrecioPoint {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn now_sec() -> u64 {
|
fn now_sec() -> u64 {
|
||||||
let start = SystemTime::now();
|
since_the_epoch().as_secs()
|
||||||
let since_the_epoch = start
|
}
|
||||||
.duration_since(UNIX_EPOCH)
|
fn now_ms() -> u128 {
|
||||||
.expect("Time went backwards");
|
since_the_epoch().as_millis()
|
||||||
since_the_epoch.as_secs()
|
}
|
||||||
|
|
||||||
|
fn since_the_epoch() -> Duration {
|
||||||
|
SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.expect("Time went backwards")
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
use anyhow::Context;
|
use anyhow::{anyhow, Context};
|
||||||
|
use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
|
||||||
|
use itertools::Itertools;
|
||||||
|
use reqwest::Url;
|
||||||
|
|
||||||
use crate::PrecioPoint;
|
use crate::{build_client, do_request, get_retry_policy, PrecioPoint};
|
||||||
|
|
||||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let ean = dom
|
let ean = dom
|
||||||
|
@ -71,3 +74,62 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
||||||
url,
|
url,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||||
|
// let (sender, recv) = async_channel::unbounded();
|
||||||
|
let client = build_client();
|
||||||
|
let initial = Url::parse("https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29")?;
|
||||||
|
|
||||||
|
let page_size = 100;
|
||||||
|
let handles: Vec<Vec<String>> = stream::iter(0..29000 / page_size)
|
||||||
|
.map(|i| {
|
||||||
|
let mut u = initial.clone();
|
||||||
|
u.query_pairs_mut()
|
||||||
|
.append_pair("No", &(i * page_size).to_string())
|
||||||
|
.append_pair("Nrpp", &(page_size).to_string())
|
||||||
|
.finish();
|
||||||
|
let client = &client;
|
||||||
|
async move {
|
||||||
|
let text = get_retry_policy()
|
||||||
|
.retry(|| do_request(client, u.as_str()).and_then(|r| r.text()))
|
||||||
|
.await?;
|
||||||
|
let dom = tl::parse(&text, tl::ParserOptions::default())?;
|
||||||
|
|
||||||
|
let list: Vec<String> = dom
|
||||||
|
.query_selector(".product_info_container")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.filter_map(|t| -> Option<anyhow::Result<String>> {
|
||||||
|
t.children()
|
||||||
|
.top()
|
||||||
|
.iter()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.find(|t| t.name() == "a")
|
||||||
|
.map(|t| {
|
||||||
|
t.attributes()
|
||||||
|
.get("href")
|
||||||
|
.flatten()
|
||||||
|
.ok_or(anyhow!("No tiene href="))
|
||||||
|
})
|
||||||
|
.map(|s| {
|
||||||
|
Ok(Url::options()
|
||||||
|
.base_url(Some(&Url::parse("https://www.cotodigital3.com.ar")?))
|
||||||
|
.parse(s?.as_utf8_str().as_ref())?
|
||||||
|
.to_string())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.try_collect()?;
|
||||||
|
Ok::<Vec<String>, anyhow::Error>(list)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.buffer_unordered(8)
|
||||||
|
.try_collect()
|
||||||
|
.await?;
|
||||||
|
let mut total: Vec<String> = vec![];
|
||||||
|
for mut urls in handles {
|
||||||
|
total.append(&mut urls);
|
||||||
|
}
|
||||||
|
Ok(total.into_iter().unique().collect())
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue