scraper-rs: WIP: fetch urls

Cat /dev/Nulo 2024-01-12 09:47:56 -03:00
parent 7806c0ba6f
commit ba484709f8
7 changed files with 136 additions and 12 deletions

scraper-rs/Cargo.lock (generated, 10 lines changed)

@@ -671,6 +671,15 @@ version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
 
+[[package]]
+name = "itertools"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.10"
@@ -1156,6 +1165,7 @@ dependencies = [
  "anyhow",
  "async-channel",
  "clap",
+ "itertools",
  "nanoid",
  "r2d2",
  "r2d2_sqlite",

scraper-rs/Cargo.toml

@@ -10,6 +10,7 @@ again = "0.1.2"
 anyhow = "1.0.79"
 async-channel = "2.1.1"
 clap = { version = "4.4.15", features = ["derive"] }
+itertools = "0.12.0"
 nanoid = "0.4.0"
 r2d2 = "0.8.10"
 r2d2_sqlite = "0.23.0"

scraper-rs/src/main.rs

@@ -1,6 +1,6 @@
 use again::RetryPolicy;
 use async_channel::Receiver;
-use clap::Parser;
+use clap::{Parser, ValueEnum};
 use nanoid::nanoid;
 use r2d2::Pool;
 use r2d2_sqlite::SqliteConnectionManager;
@@ -14,10 +14,19 @@ use std::{
 };
 use thiserror::Error;
 
+#[derive(ValueEnum, Clone)]
+enum Supermercado {
+    Dia,
+    Jumbo,
+    Carrefour,
+    Coto,
+}
+
 #[derive(Parser)] // requires `derive` feature
 enum Args {
     FetchList(FetchListArgs),
     ParseFile(ParseFileArgs),
+    GetUrlList(GetUrlListArgs),
 }
 
 #[derive(clap::Args)]
@@ -27,6 +36,11 @@ struct FetchListArgs {
 struct ParseFileArgs {
     file_path: String,
 }
+#[derive(clap::Args)]
+struct GetUrlListArgs {
+    #[arg(value_enum)]
+    supermercado: Supermercado,
+}
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
@@ -35,6 +49,7 @@ async fn main() -> anyhow::Result<()> {
     match Args::parse() {
         Args::FetchList(a) => fetch_list_cli(a.list_path).await,
         Args::ParseFile(a) => parse_file_cli(a.file_path).await,
+        Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
     }
 }
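
A quick test sketch (not part of the commit) of how the new subcommand should parse, assuming clap's default kebab-case naming for derive-based subcommands and lowercase names for ValueEnum variants:

    #[test]
    fn get_url_list_parses() {
        // "get-url-list" comes from the GetUrlList variant, "dia" from ValueEnum
        let args = Args::try_parse_from(["scraper-rs", "get-url-list", "dia"]).unwrap();
        assert!(matches!(
            args,
            Args::GetUrlList(GetUrlListArgs {
                supermercado: Supermercado::Dia
            })
        ));
    }
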
@@ -139,9 +154,19 @@ enum FetchError {
     Tl(#[from] tl::ParseError),
 }
 
-async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
+pub async fn do_request(client: &reqwest::Client, url: &str) -> anyhow::Result<reqwest::Response> {
     let request = client.get(url).build()?;
-    client.execute(request).await
+    let response = client.execute(request).await?;
+    if !response.status().is_success() {
+        bail!(FetchError::HttpStatus(response.status()));
+    }
+    Ok(response)
+}
+
+pub fn get_retry_policy() -> again::RetryPolicy {
+    RetryPolicy::exponential(Duration::from_millis(300))
+        .with_max_retries(10)
+        .with_jitter(true)
 }
 
 #[tracing::instrument(skip(client))]
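
Since the status check now lives inside do_request, retrying do_request also retries non-2xx responses, not just transport errors. A minimal usage sketch (assumed, not from the commit; fetch_text is a hypothetical helper):

    async fn fetch_text(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
        // the policy retries both connection errors and HTTP status failures,
        // since do_request maps both into Err
        let response = get_retry_policy()
            .retry(|| do_request(client, url))
            .await?;
        Ok(response.text().await?)
    }
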
@@ -149,18 +174,12 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let policy = RetryPolicy::exponential(Duration::from_millis(300))
-        .with_max_retries(10)
-        .with_jitter(true);
-
-    let response = policy
-        .retry(|| do_request(client, &url))
-        .await
-        .map_err(FetchError::Http)?;
-    if !response.status().is_success() {
-        bail!(FetchError::HttpStatus(response.status()));
-    }
-    let body = response.text().await.map_err(FetchError::Http)?;
+    let body = get_retry_policy()
+        .retry(|| do_request(client, &url))
+        .await?
+        .text()
+        .await
+        .map_err(FetchError::Http)?;
 
     let maybe_point = { scrap_url(client, url, &body).await };
@@ -202,6 +221,20 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
     Ok(())
 }
 
+async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
+    let urls = match supermercado {
+        Supermercado::Dia => sites::dia::get_urls().await?,
+        Supermercado::Jumbo => sites::jumbo::get_urls().await?,
+        Supermercado::Carrefour => sites::carrefour::get_urls().await?,
+        _ => todo!(),
+    };
+
+    urls.iter().for_each(|s| {
+        println!("{}", s);
+    });
+
+    Ok(())
+}
+
 async fn scrap_url(
     client: &reqwest::Client,
     url: String,

scraper-rs/src/sites/carrefour.rs

@@ -66,3 +66,19 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    let urls = vec![
+        "https://www.carrefour.com.ar/sitemap/product-0.xml",
+        "https://www.carrefour.com.ar/sitemap/product-1.xml",
+        "https://www.carrefour.com.ar/sitemap/product-2.xml",
+        "https://www.carrefour.com.ar/sitemap/product-3.xml",
+        "https://www.carrefour.com.ar/sitemap/product-4.xml",
+        "https://www.carrefour.com.ar/sitemap/product-5.xml",
+        "https://www.carrefour.com.ar/sitemap/product-6.xml",
+        "https://www.carrefour.com.ar/sitemap/product-7.xml",
+        "https://www.carrefour.com.ar/sitemap/product-8.xml",
+        "https://www.carrefour.com.ar/sitemap/product-9.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}

scraper-rs/src/sites/dia.rs

@@ -4,6 +4,7 @@ use simple_error::bail;
 use crate::sites::common;
 use crate::PrecioPoint;
 
+use super::vtex;
 use super::vtex::find_product_ld;
 use super::vtex::AvailabilityLd;
@@ -39,3 +40,14 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    let urls = vec![
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}

scraper-rs/src/sites/jumbo.rs

@@ -90,3 +90,25 @@ pub async fn scrap(
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    // from https://www.jumbo.com.ar/sitemap.xml
+    let urls = vec![
+        "https://www.jumbo.com.ar/sitemap/product-1.xml",
+        "https://www.jumbo.com.ar/sitemap/product-10.xml",
+        "https://www.jumbo.com.ar/sitemap/product-11.xml",
+        "https://www.jumbo.com.ar/sitemap/product-12.xml",
+        "https://www.jumbo.com.ar/sitemap/product-13.xml",
+        "https://www.jumbo.com.ar/sitemap/product-14.xml",
+        "https://www.jumbo.com.ar/sitemap/product-15.xml",
+        "https://www.jumbo.com.ar/sitemap/product-2.xml",
+        "https://www.jumbo.com.ar/sitemap/product-3.xml",
+        "https://www.jumbo.com.ar/sitemap/product-4.xml",
+        "https://www.jumbo.com.ar/sitemap/product-5.xml",
+        "https://www.jumbo.com.ar/sitemap/product-6.xml",
+        "https://www.jumbo.com.ar/sitemap/product-7.xml",
+        "https://www.jumbo.com.ar/sitemap/product-8.xml",
+        "https://www.jumbo.com.ar/sitemap/product-9.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}
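
The Coto arm in main.rs is still todo!(). A hypothetical sites/coto.rs (file name assumed; Coto's sitemap URLs are unknown here, and whether it exposes VTEX-style sitemaps at all is an open question) would follow the same shape as the other three:

    pub async fn get_urls() -> anyhow::Result<Vec<String>> {
        // placeholder: real sitemap URLs still to be discovered
        let urls = vec![];
        vtex::get_urls_from_sitemap(&urls).await
    }
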

scraper-rs/src/sites/vtex.rs

@@ -1,8 +1,11 @@
 use anyhow::{bail, Context};
+use itertools::Itertools;
 use serde::Deserialize;
 use simple_error::SimpleError;
 use tl::VDom;
 
+use crate::{build_client, do_request, get_retry_policy};
+
 use super::common;
 
 pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
@@ -100,3 +103,30 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
         },
     )
 }
+
+pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
+    let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
+    Ok(dom
+        .query_selector("loc")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .map(|t| t.inner_text(dom.parser()))
+        .map(|s| s.to_string())
+        .collect())
+}
+
+pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
+    let mut total = vec![];
+    let client = &build_client();
+    for url in sitemaps {
+        let text = get_retry_policy()
+            .retry(|| do_request(client, url))
+            .await?
+            .text()
+            .await?;
+        let mut urls = parse_urls_from_sitemap(&text)?;
+        total.append(&mut urls);
+    }
+    Ok(total.into_iter().unique().collect())
+}
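
A small test sketch (not in the commit) of what parse_urls_from_sitemap is expected to do, assuming tl's HTML parser is lenient enough to match <loc> tags in sitemap XML:

    #[cfg(test)]
    mod tests {
        #[test]
        fn extracts_locs() {
            let sitemap = "<urlset>\
                <url><loc>https://example.com/p/1</loc></url>\
                <url><loc>https://example.com/p/2</loc></url>\
                <url><loc>https://example.com/p/1</loc></url>\
                </urlset>";
            let urls = super::parse_urls_from_sitemap(sitemap).unwrap();
            // duplicates survive here; get_urls_from_sitemap dedups later
            // via Itertools::unique
            assert_eq!(urls.len(), 3);
            assert_eq!(urls[0], "https://example.com/p/1");
        }
    }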