mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
scraper-rs: WIP: fetchear urls
This commit is contained in:
parent
7806c0ba6f
commit
ba484709f8
7 changed files with 136 additions and 12 deletions
10
scraper-rs/Cargo.lock
generated
10
scraper-rs/Cargo.lock
generated
|
@ -671,6 +671,15 @@ version = "2.9.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.10"
|
||||
|
@ -1156,6 +1165,7 @@ dependencies = [
|
|||
"anyhow",
|
||||
"async-channel",
|
||||
"clap",
|
||||
"itertools",
|
||||
"nanoid",
|
||||
"r2d2",
|
||||
"r2d2_sqlite",
|
||||
|
|
|
@ -10,6 +10,7 @@ again = "0.1.2"
|
|||
anyhow = "1.0.79"
|
||||
async-channel = "2.1.1"
|
||||
clap = { version = "4.4.15", features = ["derive"] }
|
||||
itertools = "0.12.0"
|
||||
nanoid = "0.4.0"
|
||||
r2d2 = "0.8.10"
|
||||
r2d2_sqlite = "0.23.0"
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use again::RetryPolicy;
|
||||
use async_channel::Receiver;
|
||||
use clap::Parser;
|
||||
use clap::{Parser, ValueEnum};
|
||||
use nanoid::nanoid;
|
||||
use r2d2::Pool;
|
||||
use r2d2_sqlite::SqliteConnectionManager;
|
||||
|
@ -14,10 +14,19 @@ use std::{
|
|||
};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(ValueEnum, Clone)]
|
||||
enum Supermercado {
|
||||
Dia,
|
||||
Jumbo,
|
||||
Carrefour,
|
||||
Coto,
|
||||
}
|
||||
|
||||
#[derive(Parser)] // requires `derive` feature
|
||||
enum Args {
|
||||
FetchList(FetchListArgs),
|
||||
ParseFile(ParseFileArgs),
|
||||
GetUrlList(GetUrlListArgs),
|
||||
}
|
||||
#[derive(clap::Args)]
|
||||
struct FetchListArgs {
|
||||
|
@ -27,6 +36,11 @@ struct FetchListArgs {
|
|||
struct ParseFileArgs {
|
||||
file_path: String,
|
||||
}
|
||||
#[derive(clap::Args)]
|
||||
struct GetUrlListArgs {
|
||||
#[arg(value_enum)]
|
||||
supermercado: Supermercado,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
|
@ -35,6 +49,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
match Args::parse() {
|
||||
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -139,9 +154,19 @@ enum FetchError {
|
|||
Tl(#[from] tl::ParseError),
|
||||
}
|
||||
|
||||
async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
|
||||
pub async fn do_request(client: &reqwest::Client, url: &str) -> anyhow::Result<reqwest::Response> {
|
||||
let request = client.get(url).build()?;
|
||||
client.execute(request).await
|
||||
let response = client.execute(request).await?;
|
||||
if !response.status().is_success() {
|
||||
bail!(FetchError::HttpStatus(response.status()));
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub fn get_retry_policy() -> again::RetryPolicy {
|
||||
RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(10)
|
||||
.with_jitter(true)
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(client))]
|
||||
|
@ -149,18 +174,12 @@ async fn fetch_and_parse(
|
|||
client: &reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(10)
|
||||
.with_jitter(true);
|
||||
|
||||
let response = policy
|
||||
let body = get_retry_policy()
|
||||
.retry(|| do_request(client, &url))
|
||||
.await?
|
||||
.text()
|
||||
.await
|
||||
.map_err(FetchError::Http)?;
|
||||
if !response.status().is_success() {
|
||||
bail!(FetchError::HttpStatus(response.status()));
|
||||
}
|
||||
let body = response.text().await.map_err(FetchError::Http)?;
|
||||
|
||||
let maybe_point = { scrap_url(client, url, &body).await };
|
||||
|
||||
|
@ -202,6 +221,20 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
|
||||
let urls = match supermercado {
|
||||
Supermercado::Dia => sites::dia::get_urls().await?,
|
||||
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
||||
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
||||
_ => todo!(),
|
||||
};
|
||||
urls.iter().for_each(|s| {
|
||||
println!("{}", s);
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn scrap_url(
|
||||
client: &reqwest::Client,
|
||||
url: String,
|
||||
|
|
|
@ -66,3 +66,19 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
|||
url,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||
let urls = vec![
|
||||
"https://www.carrefour.com.ar/sitemap/product-0.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-1.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-2.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-3.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-4.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-5.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-6.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-7.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-8.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-9.xml",
|
||||
];
|
||||
vtex::get_urls_from_sitemap(&urls).await
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ use simple_error::bail;
|
|||
use crate::sites::common;
|
||||
use crate::PrecioPoint;
|
||||
|
||||
use super::vtex;
|
||||
use super::vtex::find_product_ld;
|
||||
use super::vtex::AvailabilityLd;
|
||||
|
||||
|
@ -39,3 +40,14 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
|||
url,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||
let urls = vec![
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||
];
|
||||
vtex::get_urls_from_sitemap(&urls).await
|
||||
}
|
||||
|
|
|
@ -90,3 +90,25 @@ pub async fn scrap(
|
|||
url,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||
// de https://www.jumbo.com.ar/sitemap.xml
|
||||
let urls = vec![
|
||||
"https://www.jumbo.com.ar/sitemap/product-1.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-10.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-11.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-12.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-13.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-14.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-15.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-2.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-3.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-4.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-5.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-6.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-7.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-8.xml",
|
||||
"https://www.jumbo.com.ar/sitemap/product-9.xml",
|
||||
];
|
||||
vtex::get_urls_from_sitemap(&urls).await
|
||||
}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
use anyhow::{bail, Context};
|
||||
use itertools::Itertools;
|
||||
use serde::Deserialize;
|
||||
use simple_error::SimpleError;
|
||||
use tl::VDom;
|
||||
|
||||
use crate::{build_client, do_request, get_retry_policy};
|
||||
|
||||
use super::common;
|
||||
|
||||
pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
|
||||
|
@ -100,3 +103,30 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
|
|||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
||||
let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
|
||||
Ok(dom
|
||||
.query_selector("loc")
|
||||
.unwrap()
|
||||
.filter_map(|h| h.get(dom.parser()))
|
||||
.filter_map(|n| n.as_tag())
|
||||
.map(|t| t.inner_text(dom.parser()))
|
||||
.map(|s| s.to_string())
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
|
||||
let mut total = vec![];
|
||||
let client = &build_client();
|
||||
for url in sitemaps {
|
||||
let text = get_retry_policy()
|
||||
.retry(|| do_request(client, url))
|
||||
.await?
|
||||
.text()
|
||||
.await?;
|
||||
let mut urls = parse_urls_from_sitemap(&text)?;
|
||||
total.append(&mut urls);
|
||||
}
|
||||
Ok(total.into_iter().unique().collect())
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue