diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 10ee7a3..076b1ab 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -186,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
     res.text().await
 }
 
 pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
-    get_retry_policy()
+    get_fetch_retry_policy()
         .retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
         .await
 }
 
-pub fn get_retry_policy() -> again::RetryPolicy {
+pub fn get_fetch_retry_policy() -> again::RetryPolicy {
     RetryPolicy::exponential(Duration::from_millis(300))
         .with_max_retries(20)
         .with_max_delay(Duration::from_secs(40))
         .with_jitter(true)
 }
 
+pub fn get_parse_retry_policy() -> again::RetryPolicy {
+    RetryPolicy::exponential(Duration::from_millis(300))
+        .with_max_retries(3)
+        .with_max_delay(Duration::from_secs(5))
+        .with_jitter(true)
+}
+
 pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
     !err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
 }
@@ -207,24 +214,32 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let body = fetch_body(client, &url).await?;
+    async fn fetch_and_scrap(
+        client: &reqwest::Client,
+        url: String,
+    ) -> Result<PrecioPoint, anyhow::Error> {
+        let body = fetch_body(client, &url).await?;
+        let maybe_point = { scrap_url(client, url, &body).await };
 
-    let maybe_point = { scrap_url(client, url, &body).await };
+        let point = match maybe_point {
+            Ok(p) => Ok(p),
+            Err(err) => {
+                let now: DateTime<Utc> = Utc::now();
+                let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
+                tokio::fs::create_dir_all(&debug_path).await.unwrap();
+                let file_path = debug_path.join(format!("{}.html", nanoid!()));
+                tokio::fs::write(&file_path, &body).await.unwrap();
+                tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
+                Err(err)
+            }
+        }?;
 
-    let point = match maybe_point {
-        Ok(p) => Ok(p),
-        Err(err) => {
-            let now: DateTime<Utc> = Utc::now();
-            let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
-            tokio::fs::create_dir_all(&debug_path).await.unwrap();
-            let file_path = debug_path.join(format!("{}.html", nanoid!()));
-            tokio::fs::write(&file_path, &body).await.unwrap();
-            tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
-            Err(err)
-        }
-    }?;
+        Ok(point)
+    }
 
-    Ok(point)
+    get_parse_retry_policy()
+        .retry(|| fetch_and_scrap(client, url.clone()))
+        .await
 }
 
 async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs
index 3f1e4d8..a225514 100644
--- a/scraper-rs/src/sites/coto.rs
+++ b/scraper-rs/src/sites/coto.rs
@@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
 use itertools::Itertools;
 use reqwest::Url;
 
-use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint};
+use crate::{
+    build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
+};
 
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
     let ean = dom
@@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
         .finish();
     let client = &client;
     async move {
-        let text = get_retry_policy()
+        let text = get_fetch_retry_policy()
             .retry_if(
                 || do_request(client, u.as_str()).and_then(|r| r.text()),
                 retry_if_wasnt_not_found,
diff --git a/scraper-rs/src/sites/jumbo.rs b/scraper-rs/src/sites/jumbo.rs
index 994c9fb..cf06e6a 100644
--- a/scraper-rs/src/sites/jumbo.rs
+++ b/scraper-rs/src/sites/jumbo.rs
@@ -7,7 +7,7 @@ use serde::Deserialize;
 use simple_error::bail;
 
 use crate::sites::common;
-use crate::{do_request, get_retry_policy, PrecioPoint};
+use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
 
 use super::vtex;
 
@@ -31,7 +31,7 @@ async fn get_ean_from_search(
         url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
         url
     };
-    let s = get_retry_policy()
+    let s = get_fetch_retry_policy()
         .retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
         .await?;
     let ean = {
diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs
index 1fdb44e..3fb2a26 100644
--- a/scraper-rs/src/sites/vtex.rs
+++ b/scraper-rs/src/sites/vtex.rs
@@ -10,7 +10,7 @@ use serde_json::json;
 use simple_error::SimpleError;
 use tl::VDom;
 
-use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found};
+use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
 
 use super::common;
 
@@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
@@ ... @@ pub async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
-    let body = get_retry_policy()
+    let body = get_fetch_retry_policy()
         .retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
         .await?
         .text()