From 75b2297a071df47ce559a01f08c4361a52c6ebe8 Mon Sep 17 00:00:00 2001 From: Nulo Date: Fri, 12 Jan 2024 13:51:32 -0300 Subject: [PATCH] wtf --- scraper-rs/src/main.rs | 18 ++++++++++-------- scraper-rs/src/sites/carrefour.rs | 2 +- scraper-rs/src/sites/dia.rs | 2 +- scraper-rs/src/sites/jumbo.rs | 2 +- scraper-rs/src/sites/vtex.rs | 11 ++++++++--- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index bc89c91..e605ffb 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -1,7 +1,7 @@ use again::RetryPolicy; use async_channel::Receiver; use clap::{Parser, ValueEnum}; -use futures::{stream, StreamExt, TryStreamExt}; +use futures::future; use nanoid::nanoid; use r2d2::Pool; use r2d2_sqlite::SqliteConnectionManager; @@ -129,7 +129,8 @@ async fn worker(rx: Receiver, pool: Pool) -> Co let mut counters = Counters::default(); while let Ok(url) = rx.recv().await { - let res = fetch_and_parse(&client, url.clone()).await; + let client = &client; + let res = fetch_and_parse(client, url.clone()).await; match res { Ok(res) => { counters.success += 1; @@ -274,13 +275,14 @@ async fn scrap_url( } } +#[derive(Clone)] struct Auto { pool: Pool, telegram_token: String, telegram_chat_id: String, } impl Auto { - async fn download_supermercado(self: &Self, supermercado: Supermercado) -> anyhow::Result<()> { + async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> { { let t0 = now_sec(); self.get_and_save_urls(&supermercado).await?; @@ -357,11 +359,11 @@ async fn auto_cli() -> anyhow::Result<()> { telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?, }; auto.inform("[auto] Empezando scrap").await; - stream::iter(Supermercado::value_variants().iter()) - .map(|s| auto.download_supermercado(s.to_owned())) - .buffer_unordered(64) - .try_collect() - .await?; + let handles: Vec<_> = Supermercado::value_variants() + .iter() + .map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned()))) + .collect(); + future::try_join_all(handles).await?; Ok(()) } async fn cron_cli() -> anyhow::Result<()> { diff --git a/scraper-rs/src/sites/carrefour.rs b/scraper-rs/src/sites/carrefour.rs index 4e4ecec..77f9a59 100644 --- a/scraper-rs/src/sites/carrefour.rs +++ b/scraper-rs/src/sites/carrefour.rs @@ -80,5 +80,5 @@ pub async fn get_urls() -> anyhow::Result> { "https://www.carrefour.com.ar/sitemap/product-8.xml", "https://www.carrefour.com.ar/sitemap/product-9.xml", ]; - vtex::get_urls_from_sitemap(&urls).await + vtex::get_urls_from_sitemap(urls).await } diff --git a/scraper-rs/src/sites/dia.rs b/scraper-rs/src/sites/dia.rs index db7afb7..d4ae9b5 100644 --- a/scraper-rs/src/sites/dia.rs +++ b/scraper-rs/src/sites/dia.rs @@ -49,5 +49,5 @@ pub async fn get_urls() -> anyhow::Result> { "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml", "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml", ]; - vtex::get_urls_from_sitemap(&urls).await + vtex::get_urls_from_sitemap(urls).await } diff --git a/scraper-rs/src/sites/jumbo.rs b/scraper-rs/src/sites/jumbo.rs index 8a5d039..4be27cf 100644 --- a/scraper-rs/src/sites/jumbo.rs +++ b/scraper-rs/src/sites/jumbo.rs @@ -110,5 +110,5 @@ pub async fn get_urls() -> anyhow::Result> { "https://www.jumbo.com.ar/sitemap/product-8.xml", "https://www.jumbo.com.ar/sitemap/product-9.xml", ]; - vtex::get_urls_from_sitemap(&urls).await + vtex::get_urls_from_sitemap(urls).await } diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs index 378fc81..fc50304 100644 --- a/scraper-rs/src/sites/vtex.rs +++ b/scraper-rs/src/sites/vtex.rs @@ -117,21 +117,26 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result> { .collect()) } -pub async fn get_urls_from_sitemap<'a>(sitemaps: &[&str]) -> anyhow::Result> { +pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result> { let mut total: Vec = vec![]; let client = build_client(); let handles = stream::iter(sitemaps) .map(|url| { - let client = &client; + let client = client.clone(); + let url = url.to_string(); async move { + let client = client; + let url = url; let text = get_retry_policy() - .retry(|| do_request(client, url)) + .retry(|| do_request(&client, &url)) .await? .text() .await?; parse_urls_from_sitemap(&text) } }) + // https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246 + .boxed() .buffer_unordered(8) .try_collect::>() .await?;