mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
reintentar si falla parsear
muchos sitios vtex devuelven un html sin casi nada aleatoriamente
This commit is contained in:
parent
18771cb944
commit
b807de8eb4
4 changed files with 41 additions and 24 deletions
|
@ -186,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
|
||||||
res.text().await
|
res.text().await
|
||||||
}
|
}
|
||||||
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
|
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
|
||||||
get_retry_policy()
|
get_fetch_retry_policy()
|
||||||
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
|
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_retry_policy() -> again::RetryPolicy {
|
pub fn get_fetch_retry_policy() -> again::RetryPolicy {
|
||||||
RetryPolicy::exponential(Duration::from_millis(300))
|
RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
.with_max_retries(20)
|
.with_max_retries(20)
|
||||||
.with_max_delay(Duration::from_secs(40))
|
.with_max_delay(Duration::from_secs(40))
|
||||||
.with_jitter(true)
|
.with_jitter(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_parse_retry_policy() -> again::RetryPolicy {
|
||||||
|
RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
|
.with_max_retries(3)
|
||||||
|
.with_max_delay(Duration::from_secs(5))
|
||||||
|
.with_jitter(true)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
||||||
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
|
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
|
||||||
}
|
}
|
||||||
|
@ -207,24 +214,32 @@ async fn fetch_and_parse(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
url: String,
|
url: String,
|
||||||
) -> Result<PrecioPoint, anyhow::Error> {
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let body = fetch_body(client, &url).await?;
|
async fn fetch_and_scrap(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
url: String,
|
||||||
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let body = fetch_body(client, &url).await?;
|
||||||
|
let maybe_point = { scrap_url(client, url, &body).await };
|
||||||
|
|
||||||
let maybe_point = { scrap_url(client, url, &body).await };
|
let point = match maybe_point {
|
||||||
|
Ok(p) => Ok(p),
|
||||||
|
Err(err) => {
|
||||||
|
let now: DateTime<Utc> = Utc::now();
|
||||||
|
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
|
||||||
|
tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||||
|
let file_path = debug_path.join(format!("{}.html", nanoid!()));
|
||||||
|
tokio::fs::write(&file_path, &body).await.unwrap();
|
||||||
|
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
|
||||||
|
Err(err)
|
||||||
|
}
|
||||||
|
}?;
|
||||||
|
|
||||||
let point = match maybe_point {
|
Ok(point)
|
||||||
Ok(p) => Ok(p),
|
}
|
||||||
Err(err) => {
|
|
||||||
let now: DateTime<Utc> = Utc::now();
|
|
||||||
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
|
|
||||||
tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
|
||||||
let file_path = debug_path.join(format!("{}.html", nanoid!()));
|
|
||||||
tokio::fs::write(&file_path, &body).await.unwrap();
|
|
||||||
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
|
|
||||||
Err(err)
|
|
||||||
}
|
|
||||||
}?;
|
|
||||||
|
|
||||||
Ok(point)
|
get_parse_retry_policy()
|
||||||
|
.retry(|| fetch_and_scrap(client, url.clone()))
|
||||||
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||||
|
|
|
@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
|
||||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint};
|
use crate::{
|
||||||
|
build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
|
||||||
|
};
|
||||||
|
|
||||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let ean = dom
|
let ean = dom
|
||||||
|
@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||||
.finish();
|
.finish();
|
||||||
let client = &client;
|
let client = &client;
|
||||||
async move {
|
async move {
|
||||||
let text = get_retry_policy()
|
let text = get_fetch_retry_policy()
|
||||||
.retry_if(
|
.retry_if(
|
||||||
|| do_request(client, u.as_str()).and_then(|r| r.text()),
|
|| do_request(client, u.as_str()).and_then(|r| r.text()),
|
||||||
retry_if_wasnt_not_found,
|
retry_if_wasnt_not_found,
|
||||||
|
|
|
@ -7,7 +7,7 @@ use serde::Deserialize;
|
||||||
use simple_error::bail;
|
use simple_error::bail;
|
||||||
|
|
||||||
use crate::sites::common;
|
use crate::sites::common;
|
||||||
use crate::{do_request, get_retry_policy, PrecioPoint};
|
use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
|
||||||
|
|
||||||
use super::vtex;
|
use super::vtex;
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ async fn get_ean_from_search(
|
||||||
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
||||||
url
|
url
|
||||||
};
|
};
|
||||||
let s = get_retry_policy()
|
let s = get_fetch_retry_policy()
|
||||||
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
|
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
|
||||||
.await?;
|
.await?;
|
||||||
let ean = {
|
let ean = {
|
||||||
|
|
|
@ -10,7 +10,7 @@ use serde_json::json;
|
||||||
use simple_error::SimpleError;
|
use simple_error::SimpleError;
|
||||||
use tl::VDom;
|
use tl::VDom;
|
||||||
|
|
||||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found};
|
use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
|
||||||
|
|
||||||
use super::common;
|
use super::common;
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
let url = url.to_string();
|
let url = url.to_string();
|
||||||
async move {
|
async move {
|
||||||
let client = client;
|
let client = client;
|
||||||
let text = get_retry_policy()
|
let text = get_fetch_retry_policy()
|
||||||
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
|
@ -152,7 +152,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
|
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
|
||||||
let body = get_retry_policy()
|
let body = get_fetch_retry_policy()
|
||||||
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
|
|
Loading…
Reference in a new issue