reintentar si falla el parseo

muchos sitios VTEX devuelven aleatoriamente un HTML casi vacío
This commit is contained in:
Cat /dev/Nulo 2024-02-06 22:48:44 -03:00
parent 18771cb944
commit b807de8eb4
4 changed files with 41 additions and 24 deletions

View file

@ -186,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
res.text().await res.text().await
} }
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> { pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
get_retry_policy() get_fetch_retry_policy()
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found) .retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
.await .await
} }
pub fn get_retry_policy() -> again::RetryPolicy { pub fn get_fetch_retry_policy() -> again::RetryPolicy {
RetryPolicy::exponential(Duration::from_millis(300)) RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(20) .with_max_retries(20)
.with_max_delay(Duration::from_secs(40)) .with_max_delay(Duration::from_secs(40))
.with_jitter(true) .with_jitter(true)
} }
/// Builds the retry policy applied when fetching and scraping a page fails.
///
/// The policy uses exponential backoff starting at 300 ms, never waits more
/// than 5 s between attempts, gives up after 3 retries, and adds jitter to
/// each delay.
pub fn get_parse_retry_policy() -> again::RetryPolicy {
    // Initial backoff delay; later delays grow exponentially from here.
    let initial_delay = Duration::from_millis(300);
    // Upper bound for any single wait between attempts.
    let delay_ceiling = Duration::from_secs(5);

    let policy = RetryPolicy::exponential(initial_delay);
    let policy = policy.with_max_retries(3);
    let policy = policy.with_max_delay(delay_ceiling);
    policy.with_jitter(true)
}
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool { pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND) !err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
} }
@ -207,24 +214,32 @@ async fn fetch_and_parse(
client: &reqwest::Client, client: &reqwest::Client,
url: String, url: String,
) -> Result<PrecioPoint, anyhow::Error> { ) -> Result<PrecioPoint, anyhow::Error> {
let body = fetch_body(client, &url).await?; async fn fetch_and_scrap(
client: &reqwest::Client,
url: String,
) -> Result<PrecioPoint, anyhow::Error> {
let body = fetch_body(client, &url).await?;
let maybe_point = { scrap_url(client, url, &body).await };
let maybe_point = { scrap_url(client, url, &body).await }; let point = match maybe_point {
Ok(p) => Ok(p),
Err(err) => {
let now: DateTime<Utc> = Utc::now();
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
let point = match maybe_point { Ok(point)
Ok(p) => Ok(p), }
Err(err) => {
let now: DateTime<Utc> = Utc::now();
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
Ok(point) get_parse_retry_policy()
.retry(|| fetch_and_scrap(client, url.clone()))
.await
} }
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> { async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {

View file

@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
use itertools::Itertools; use itertools::Itertools;
use reqwest::Url; use reqwest::Url;
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint}; use crate::{
build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
};
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> { pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let ean = dom let ean = dom
@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
.finish(); .finish();
let client = &client; let client = &client;
async move { async move {
let text = get_retry_policy() let text = get_fetch_retry_policy()
.retry_if( .retry_if(
|| do_request(client, u.as_str()).and_then(|r| r.text()), || do_request(client, u.as_str()).and_then(|r| r.text()),
retry_if_wasnt_not_found, retry_if_wasnt_not_found,

View file

@ -7,7 +7,7 @@ use serde::Deserialize;
use simple_error::bail; use simple_error::bail;
use crate::sites::common; use crate::sites::common;
use crate::{do_request, get_retry_policy, PrecioPoint}; use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
use super::vtex; use super::vtex;
@ -31,7 +31,7 @@ async fn get_ean_from_search(
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku))); url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
url url
}; };
let s = get_retry_policy() let s = get_fetch_retry_policy()
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text())) .retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
.await?; .await?;
let ean = { let ean = {

View file

@ -10,7 +10,7 @@ use serde_json::json;
use simple_error::SimpleError; use simple_error::SimpleError;
use tl::VDom; use tl::VDom;
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found}; use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
use super::common; use super::common;
@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
let url = url.to_string(); let url = url.to_string();
async move { async move {
let client = client; let client = client;
let text = get_retry_policy() let text = get_fetch_retry_policy()
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found) .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
.await? .await?
.text() .text()
@ -152,7 +152,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
} }
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> { async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
let body = get_retry_policy() let body = get_fetch_retry_policy()
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found) .retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
.await? .await?
.text() .text()