From b807de8eb4f2d14056b9270670b8a69ad6842e6f Mon Sep 17 00:00:00 2001
From: Nulo
Date: Tue, 6 Feb 2024 22:48:44 -0300
Subject: [PATCH 1/4] retry if parsing fails

many vtex sites randomly return an html with almost nothing in it
---
 scraper-rs/src/main.rs        | 49 +++++++++++++++++++++++------------
 scraper-rs/src/sites/coto.rs  |  6 +++--
 scraper-rs/src/sites/jumbo.rs |  4 +--
 scraper-rs/src/sites/vtex.rs  |  6 ++---
 4 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 10ee7a3..076b1ab 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -186,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
     res.text().await
 }
 
 pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
-    get_retry_policy()
+    get_fetch_retry_policy()
         .retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
         .await
 }
 
-pub fn get_retry_policy() -> again::RetryPolicy {
+pub fn get_fetch_retry_policy() -> again::RetryPolicy {
     RetryPolicy::exponential(Duration::from_millis(300))
         .with_max_retries(20)
         .with_max_delay(Duration::from_secs(40))
         .with_jitter(true)
 }
 
+pub fn get_parse_retry_policy() -> again::RetryPolicy {
+    RetryPolicy::exponential(Duration::from_millis(300))
+        .with_max_retries(3)
+        .with_max_delay(Duration::from_secs(5))
+        .with_jitter(true)
+}
+
 pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
     !err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
 }
@@ -207,24 +214,32 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let body = fetch_body(client, &url).await?;
+    async fn fetch_and_scrap(
+        client: &reqwest::Client,
+        url: String,
+    ) -> Result<PrecioPoint, anyhow::Error> {
+        let body = fetch_body(client, &url).await?;
+        let maybe_point = { scrap_url(client, url, &body).await };
 
-    let maybe_point = { scrap_url(client, url, &body).await };
+        let point = match maybe_point {
+            Ok(p) => Ok(p),
+            Err(err) => {
+                let now: DateTime<Utc> = Utc::now();
+                let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
+                tokio::fs::create_dir_all(&debug_path).await.unwrap();
+                let file_path = debug_path.join(format!("{}.html", nanoid!()));
+                tokio::fs::write(&file_path, &body).await.unwrap();
+                tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
+                Err(err)
+            }
+        }?;
 
-    let point = match maybe_point {
-        Ok(p) => Ok(p),
-        Err(err) => {
-            let now: DateTime<Utc> = Utc::now();
-            let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
-            tokio::fs::create_dir_all(&debug_path).await.unwrap();
-            let file_path = debug_path.join(format!("{}.html", nanoid!()));
-            tokio::fs::write(&file_path, &body).await.unwrap();
-            tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
-            Err(err)
-        }
-    }?;
+        Ok(point)
+    }
 
-    Ok(point)
+    get_parse_retry_policy()
+        .retry(|| fetch_and_scrap(client, url.clone()))
+        .await
 }
 
 async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs
index 3f1e4d8..a225514 100644
--- a/scraper-rs/src/sites/coto.rs
+++ b/scraper-rs/src/sites/coto.rs
@@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
 use itertools::Itertools;
 use reqwest::Url;
 
-use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint};
+use crate::{
+    build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
+};
 
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
     let ean = dom
@@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
             .finish();
         let client = &client;
         async move {
-            let text = get_retry_policy()
+            let text = get_fetch_retry_policy()
                 .retry_if(
                     || do_request(client, u.as_str()).and_then(|r| r.text()),
                     retry_if_wasnt_not_found,
diff --git a/scraper-rs/src/sites/jumbo.rs b/scraper-rs/src/sites/jumbo.rs
index 994c9fb..cf06e6a 100644
--- a/scraper-rs/src/sites/jumbo.rs
+++ b/scraper-rs/src/sites/jumbo.rs
@@ -7,7 +7,7 @@ use serde::Deserialize;
 use simple_error::bail;
 
 use crate::sites::common;
-use crate::{do_request, get_retry_policy, PrecioPoint};
+use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
 
 use super::vtex;
 
@@ -31,7 +31,7 @@ async fn get_ean_from_search(
         url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
         url
     };
-    let s = get_retry_policy()
+    let s = get_fetch_retry_policy()
         .retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
         .await?;
     let ean = {
diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs
index 1fdb44e..3fb2a26 100644
--- a/scraper-rs/src/sites/vtex.rs
+++ b/scraper-rs/src/sites/vtex.rs
@@ -10,7 +10,7 @@ use serde_json::json;
 use simple_error::SimpleError;
 use tl::VDom;
 
-use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found};
+use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
 
 use super::common;
 
@@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
@@ ... @@ (client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
-    let body = get_retry_policy()
+    let body = get_fetch_retry_policy()
         .retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
         .await?
         .text()

From c66c325a4f0807c43c95216a69c7d3f54b0e78da Mon Sep 17 00:00:00 2001
From: Nulo
Date: Tue, 6 Feb 2024 22:53:46 -0300
Subject: [PATCH 2/4] don't retry fetch+parse if it's a 404

---
 scraper-rs/src/main.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 076b1ab..2eb5f08 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -238,7 +238,13 @@ async fn fetch_and_parse(
     }
 
     get_parse_retry_policy()
-        .retry(|| fetch_and_scrap(client, url.clone()))
+        .retry_if(
+            || fetch_and_scrap(client, url.clone()),
+            |err: &anyhow::Error| match err.downcast_ref::<reqwest::Error>() {
+                Some(e) => !e.status().is_some_and(|s| s == StatusCode::NOT_FOUND),
+                None => true,
+            },
+        )
         .await
 }
 

From d1d496514c66ef4079816f4610941cb89c1e41e2 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Tue, 6 Feb 2024 23:17:24 -0300
Subject: [PATCH 3/4] improve parse retry timing

---
 scraper-rs/src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 2eb5f08..b5d75b9 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -199,8 +199,8 @@ pub fn get_fetch_retry_policy() -> again::RetryPolicy {
 }
 
 pub fn get_parse_retry_policy() -> again::RetryPolicy {
-    RetryPolicy::exponential(Duration::from_millis(300))
-        .with_max_retries(3)
+    RetryPolicy::exponential(Duration::from_millis(1500))
+        .with_max_retries(5)
         .with_max_delay(Duration::from_secs(5))
         .with_jitter(true)
 }

From 1118bcf75d4f596392d466c0a69f73b95eaaab5f Mon Sep 17 00:00:00 2001
From: Nulo
Date: Tue, 6 Feb 2024 23:20:04 -0300
Subject: [PATCH 4/4] bringup farmacity

---
 db-datos/supermercado.ts                |  6 +++
 scraper-rs/src/main.rs                  |  4 ++
 scraper-rs/src/sites/farmacity.rs       | 50 +++++++++++++++++++++++++
 scraper-rs/src/sites/mod.rs             |  1 +
 scraper-rs/src/supermercado.rs          |  2 +
 sitio/src/routes/ean/[ean]/+page.svelte |  1 +
 6 files changed, 64 insertions(+)
 create mode 100644 scraper-rs/src/sites/farmacity.rs

diff --git a/db-datos/supermercado.ts b/db-datos/supermercado.ts
index ec4c8c3..992e1bf 100644
--- a/db-datos/supermercado.ts
+++ b/db-datos/supermercado.ts
@@ -3,25 +3,31 @@ export enum Supermercado {
   Carrefour = "Carrefour",
   Coto = "Coto",
   Jumbo = "Jumbo",
+  Farmacity = "Farmacity",
 }
 export const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
   Supermercado.Coto,
   Supermercado.Dia,
   Supermercado.Jumbo,
+  Supermercado.Farmacity,
 ];
 export const hosts: { [host: string]: Supermercado } = {
   "diaonline.supermercadosdia.com.ar": Supermercado.Dia,
   "www.carrefour.com.ar": Supermercado.Carrefour,
   "www.cotodigital3.com.ar": Supermercado.Coto,
   "www.jumbo.com.ar": Supermercado.Jumbo,
+  "www.farmacity.com": Supermercado.Farmacity,
 };
 export const hostBySupermercado = Object.fromEntries(
   Object.entries(hosts).map(([a, b]) => [b, a])
 ) as Record<Supermercado, string>;
+
+// also update in sitio/src/routes/ean/[ean]/+page.svelte
 export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
   [Supermercado.Dia]: "#d52b1e",
   [Supermercado.Carrefour]: "#19549d",
   [Supermercado.Coto]: "#e20025",
   [Supermercado.Jumbo]: "#2dc850",
+  [Supermercado.Farmacity]: "#EF7603",
 };
diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index b5d75b9..0f9efbf 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -285,6 +285,7 @@ async fn get_urls(supermercado: &Supermercado) -> Result<Vec<String>, anyhow::Er
         Supermercado::Jumbo => sites::jumbo::get_urls().await?,
         Supermercado::Carrefour => sites::carrefour::get_urls().await?,
         Supermercado::Coto => sites::coto::get_urls().await?,
+        Supermercado::Farmacity => sites::farmacity::get_urls().await?,
     })
 }
 
@@ -305,6 +306,9 @@ async fn scrap_url(
             sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
         }
         "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
+        "www.farmacity.com" => {
+            sites::farmacity::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+        }
         s => bail!("Unknown host {}", s),
     }
 }
diff --git a/scraper-rs/src/sites/farmacity.rs b/scraper-rs/src/sites/farmacity.rs
new file mode 100644
index 0000000..5e9cc11
--- /dev/null
+++ b/scraper-rs/src/sites/farmacity.rs
@@ -0,0 +1,50 @@
+use anyhow::Context;
+use simple_error::bail;
+
+use crate::sites::common;
+use crate::PrecioPoint;
+
+use super::vtex;
+use super::vtex::find_product_ld;
+use super::vtex::AvailabilityLd;
+
+pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
+    let ean = common::get_meta_content(dom, "product:retailer_item_id")
+        .context("Parsing EAN")?
+        .to_string();
+    let precio_centavos = common::price_from_meta(dom)?;
+
+    let (name, image_url, in_stock) = match find_product_ld(dom) {
+        Some(pm) => {
+            let p = pm?;
+            (
+                Some(p.name),
+                Some(p.image),
+                Some(
+                    p.offers.offers.first().context("No offer")?.availability
+                        == AvailabilityLd::InStock,
+                ),
+            )
+        }
+        None => bail!("No JSON/LD"),
+    };
+
+    Ok(PrecioPoint {
+        ean,
+        fetched_at: crate::now_sec(),
+        in_stock,
+        name,
+        image_url,
+        parser_version: 5,
+        precio_centavos,
+        url,
+    })
+}
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    let urls = vec![
+        "https://www.farmacity.com/sitemap/product-0.xml",
+        "https://www.farmacity.com/sitemap/product-1.xml",
+    ];
+    vtex::get_urls_from_sitemap(urls).await
+}
diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs
index e305f94..d96e38e 100644
--- a/scraper-rs/src/sites/mod.rs
+++ b/scraper-rs/src/sites/mod.rs
@@ -2,5 +2,6 @@ pub mod carrefour;
 mod common;
 pub mod coto;
 pub mod dia;
+pub mod farmacity;
 pub mod jumbo;
 pub mod vtex;
diff --git a/scraper-rs/src/supermercado.rs b/scraper-rs/src/supermercado.rs
index d7cdbc0..19bbecb 100644
--- a/scraper-rs/src/supermercado.rs
+++ b/scraper-rs/src/supermercado.rs
@@ -6,6 +6,7 @@ pub enum Supermercado {
     Jumbo,
     Carrefour,
     Coto,
+    Farmacity,
 }
 impl Supermercado {
     pub fn host(&self) -> &'static str {
@@ -14,6 +15,7 @@ impl Supermercado {
             Self::Carrefour => "www.carrefour.com.ar",
             Self::Coto => "www.cotodigital3.com.ar",
             Self::Jumbo => "www.jumbo.com.ar",
+            Self::Farmacity => "www.farmacity.com",
         }
     }
 }
diff --git a/sitio/src/routes/ean/[ean]/+page.svelte b/sitio/src/routes/ean/[ean]/+page.svelte
index d3ce771..78cc6a8 100644
--- a/sitio/src/routes/ean/[ean]/+page.svelte
+++ b/sitio/src/routes/ean/[ean]/+page.svelte
@@ -18,6 +18,7 @@
     [Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
     [Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
     [Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
+    [Supermercado.Farmacity]: "bg-[#EF7603] focus:ring-[#EF7603]",
   };
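
After patch 3 the series settles on two retry levels: get_fetch_retry_policy()
retries individual HTTP requests aggressively (20 tries, up to 40s apart),
while get_parse_retry_policy() re-runs the whole fetch+parse pair a few more
times with a slower backoff, since a failed parse usually means vtex served a
nearly-empty page and only a fresh fetch can help. A minimal self-contained
sketch of the outer loop using the again crate; the flaky task and its name
are invented for illustration and are not part of the scraper:

use std::sync::atomic::{AtomicU32, Ordering};
use std::time::Duration;

use again::RetryPolicy;

static ATTEMPTS: AtomicU32 = AtomicU32::new(0);

// Stands in for fetch_and_scrap(): the first two attempts fail the way a
// vtex page does when it randomly comes back almost empty.
async fn flaky_fetch_and_scrap() -> anyhow::Result<String> {
    if ATTEMPTS.fetch_add(1, Ordering::SeqCst) < 2 {
        anyhow::bail!("page had no parseable product data");
    }
    Ok("PrecioPoint { .. }".to_string())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Same shape as get_parse_retry_policy() after patch 3: few, slow
    // retries, because every attempt repeats a full fetch.
    let parse_policy = RetryPolicy::exponential(Duration::from_millis(1500))
        .with_max_retries(5)
        .with_max_delay(Duration::from_secs(5))
        .with_jitter(true);

    // retry() polls a fresh future from the task on every attempt.
    let point = parse_policy.retry(flaky_fetch_and_scrap).await?;
    println!("{point}");
    Ok(())
}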
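
The patch-2 predicate is the reusable part: retry_if() hands the predicate a
reference to the error, and anyhow::Error::downcast_ref() recovers the
concrete reqwest::Error when the failure happened during the fetch, so a 404
opts out of the loop while parse errors always get another fetch. The same
logic as a named function (the name should_retry is illustrative):

use reqwest::StatusCode;

// Retry everything except an anyhow::Error wrapping a reqwest 404.
fn should_retry(err: &anyhow::Error) -> bool {
    match err.downcast_ref::<reqwest::Error>() {
        // The fetch itself failed: pointless to hammer a page that isn't there.
        Some(e) => !e.status().is_some_and(|s| s == StatusCode::NOT_FOUND),
        // A parse error (or any non-HTTP error): worth a fresh fetch.
        None => true,
    }
}

// Usage, mirroring fetch_and_parse():
//     get_parse_retry_policy()
//         .retry_if(|| fetch_and_scrap(client, url.clone()), should_retry)
//         .await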