mirror of https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00

commit ba484709f8 (parent 7806c0ba6f)
scraper-rs: WIP: fetch URLs

7 changed files with 136 additions and 12 deletions
scraper-rs/Cargo.lock (generated, 10 changes)
@@ -671,6 +671,15 @@ version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
 
+[[package]]
+name = "itertools"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.10"

@@ -1156,6 +1165,7 @@ dependencies = [
  "anyhow",
  "async-channel",
  "clap",
+ "itertools",
  "nanoid",
  "r2d2",
  "r2d2_sqlite",
scraper-rs/Cargo.toml

@@ -10,6 +10,7 @@ again = "0.1.2"
 anyhow = "1.0.79"
 async-channel = "2.1.1"
 clap = { version = "4.4.15", features = ["derive"] }
+itertools = "0.12.0"
 nanoid = "0.4.0"
 r2d2 = "0.8.10"
 r2d2_sqlite = "0.23.0"
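
The lone new dependency, itertools, is pulled in for the .unique() adaptor used in the vtex module below to deduplicate sitemap URLs. A minimal sketch of what that adaptor does:

use itertools::Itertools;

fn main() {
    // unique() drops repeated items, keeping first-seen order.
    let deduped: Vec<&str> = ["a", "b", "a", "c"].into_iter().unique().collect();
    assert_eq!(deduped, vec!["a", "b", "c"]);
}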
scraper-rs/src/main.rs

@@ -1,6 +1,6 @@
 use again::RetryPolicy;
 use async_channel::Receiver;
-use clap::Parser;
+use clap::{Parser, ValueEnum};
 use nanoid::nanoid;
 use r2d2::Pool;
 use r2d2_sqlite::SqliteConnectionManager;
@@ -14,10 +14,19 @@ use std::{
 };
 use thiserror::Error;
 
+#[derive(ValueEnum, Clone)]
+enum Supermercado {
+    Dia,
+    Jumbo,
+    Carrefour,
+    Coto,
+}
+
 #[derive(Parser)] // requires `derive` feature
 enum Args {
     FetchList(FetchListArgs),
     ParseFile(ParseFileArgs),
+    GetUrlList(GetUrlListArgs),
 }
 #[derive(clap::Args)]
 struct FetchListArgs {
@@ -27,6 +36,11 @@ struct FetchListArgs {
 struct ParseFileArgs {
     file_path: String,
 }
+#[derive(clap::Args)]
+struct GetUrlListArgs {
+    #[arg(value_enum)]
+    supermercado: Supermercado,
+}
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
@@ -35,6 +49,7 @@ async fn main() -> anyhow::Result<()> {
     match Args::parse() {
         Args::FetchList(a) => fetch_list_cli(a.list_path).await,
         Args::ParseFile(a) => parse_file_cli(a.file_path).await,
+        Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
     }
 }
 
@@ -139,9 +154,19 @@ enum FetchError {
     Tl(#[from] tl::ParseError),
 }
 
-async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
+pub async fn do_request(client: &reqwest::Client, url: &str) -> anyhow::Result<reqwest::Response> {
     let request = client.get(url).build()?;
-    client.execute(request).await
+    let response = client.execute(request).await?;
+    if !response.status().is_success() {
+        bail!(FetchError::HttpStatus(response.status()));
+    }
+    Ok(response)
+}
+
+pub fn get_retry_policy() -> again::RetryPolicy {
+    RetryPolicy::exponential(Duration::from_millis(300))
+        .with_max_retries(10)
+        .with_jitter(true)
 }
 
 #[tracing::instrument(skip(client))]
@@ -149,18 +174,12 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let policy = RetryPolicy::exponential(Duration::from_millis(300))
-        .with_max_retries(10)
-        .with_jitter(true);
-
-    let response = policy
+    let body = get_retry_policy()
         .retry(|| do_request(client, &url))
+        .await?
+        .text()
         .await
         .map_err(FetchError::Http)?;
-    if !response.status().is_success() {
-        bail!(FetchError::HttpStatus(response.status()));
-    }
-    let body = response.text().await.map_err(FetchError::Http)?;
 
     let maybe_point = { scrap_url(client, url, &body).await };
 
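
The two hunks above factor the retry settings into get_retry_policy() and move the HTTP status check into do_request(), so fetch_and_parse() shrinks to a single retried chain. For reference, a standalone sketch of the again crate's retry pattern, with flaky() as a made-up placeholder for any fallible async call such as do_request():

use again::RetryPolicy;
use std::time::Duration;

// Placeholder for a fallible async operation.
async fn flaky() -> anyhow::Result<&'static str> {
    Ok("body")
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let policy = RetryPolicy::exponential(Duration::from_millis(300))
        .with_max_retries(10)
        .with_jitter(true);
    // retry() re-invokes the closure whenever it resolves to Err,
    // sleeping with exponential backoff (plus jitter) between attempts.
    let body = policy.retry(|| flaky()).await?;
    println!("{}", body);
    Ok(())
}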
@@ -202,6 +221,20 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
     Ok(())
 }
 
+async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
+    let urls = match supermercado {
+        Supermercado::Dia => sites::dia::get_urls().await?,
+        Supermercado::Jumbo => sites::jumbo::get_urls().await?,
+        Supermercado::Carrefour => sites::carrefour::get_urls().await?,
+        _ => todo!(),
+    };
+    urls.iter().for_each(|s| {
+        println!("{}", s);
+    });
+
+    Ok(())
+}
+
 async fn scrap_url(
     client: &reqwest::Client,
     url: String,
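
With clap's derive defaults, variant names are kebab-cased on the command line, so the new subcommand would presumably be invoked as something like "scraper-rs get-url-list dia". A self-contained sketch of just that wiring, assuming those defaults:

use clap::{Parser, ValueEnum};

#[derive(ValueEnum, Clone, Debug)]
enum Supermercado {
    Dia,
    Jumbo,
    Carrefour,
    Coto,
}

#[derive(Parser, Debug)]
enum Args {
    GetUrlList(GetUrlListArgs),
}

#[derive(clap::Args, Debug)]
struct GetUrlListArgs {
    // value_enum restricts the argument to: dia, jumbo, carrefour, coto
    #[arg(value_enum)]
    supermercado: Supermercado,
}

fn main() {
    // Equivalent to the binary being run as: scraper-rs get-url-list dia
    let args = Args::parse_from(["scraper-rs", "get-url-list", "dia"]);
    println!("{:?}", args);
}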
scraper-rs/src/sites/carrefour.rs

@@ -66,3 +66,19 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    let urls = vec![
+        "https://www.carrefour.com.ar/sitemap/product-0.xml",
+        "https://www.carrefour.com.ar/sitemap/product-1.xml",
+        "https://www.carrefour.com.ar/sitemap/product-2.xml",
+        "https://www.carrefour.com.ar/sitemap/product-3.xml",
+        "https://www.carrefour.com.ar/sitemap/product-4.xml",
+        "https://www.carrefour.com.ar/sitemap/product-5.xml",
+        "https://www.carrefour.com.ar/sitemap/product-6.xml",
+        "https://www.carrefour.com.ar/sitemap/product-7.xml",
+        "https://www.carrefour.com.ar/sitemap/product-8.xml",
+        "https://www.carrefour.com.ar/sitemap/product-9.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}
scraper-rs/src/sites/dia.rs

@@ -4,6 +4,7 @@ use simple_error::bail;
 use crate::sites::common;
 use crate::PrecioPoint;
 
+use super::vtex;
 use super::vtex::find_product_ld;
 use super::vtex::AvailabilityLd;
 
@@ -39,3 +40,14 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    let urls = vec![
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
+        "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}
scraper-rs/src/sites/jumbo.rs

@@ -90,3 +90,25 @@ pub async fn scrap(
         url,
     })
 }
+
+pub async fn get_urls() -> anyhow::Result<Vec<String>> {
+    // from https://www.jumbo.com.ar/sitemap.xml
+    let urls = vec![
+        "https://www.jumbo.com.ar/sitemap/product-1.xml",
+        "https://www.jumbo.com.ar/sitemap/product-10.xml",
+        "https://www.jumbo.com.ar/sitemap/product-11.xml",
+        "https://www.jumbo.com.ar/sitemap/product-12.xml",
+        "https://www.jumbo.com.ar/sitemap/product-13.xml",
+        "https://www.jumbo.com.ar/sitemap/product-14.xml",
+        "https://www.jumbo.com.ar/sitemap/product-15.xml",
+        "https://www.jumbo.com.ar/sitemap/product-2.xml",
+        "https://www.jumbo.com.ar/sitemap/product-3.xml",
+        "https://www.jumbo.com.ar/sitemap/product-4.xml",
+        "https://www.jumbo.com.ar/sitemap/product-5.xml",
+        "https://www.jumbo.com.ar/sitemap/product-6.xml",
+        "https://www.jumbo.com.ar/sitemap/product-7.xml",
+        "https://www.jumbo.com.ar/sitemap/product-8.xml",
+        "https://www.jumbo.com.ar/sitemap/product-9.xml",
+    ];
+    vtex::get_urls_from_sitemap(&urls).await
+}
scraper-rs/src/sites/vtex.rs

@@ -1,8 +1,11 @@
 use anyhow::{bail, Context};
+use itertools::Itertools;
 use serde::Deserialize;
 use simple_error::SimpleError;
 use tl::VDom;
 
+use crate::{build_client, do_request, get_retry_policy};
+
 use super::common;
 
 pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {

@@ -100,3 +103,30 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
         },
     )
 }
+
+pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
+    let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
+    Ok(dom
+        .query_selector("loc")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .map(|t| t.inner_text(dom.parser()))
+        .map(|s| s.to_string())
+        .collect())
+}
+
+pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
+    let mut total = vec![];
+    let client = &build_client();
+    for url in sitemaps {
+        let text = get_retry_policy()
+            .retry(|| do_request(client, url))
+            .await?
+            .text()
+            .await?;
+        let mut urls = parse_urls_from_sitemap(&text)?;
+        total.append(&mut urls);
+    }
+    Ok(total.into_iter().unique().collect())
+}
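
parse_urls_from_sitemap() leans on tl, an HTML parser, which parses the sitemap XML leniently enough to collect the text of every <loc> element; get_urls_from_sitemap() then fetches each sitemap under the shared retry policy and deduplicates the result with itertools. A small usage sketch of the parsing half, with a made-up two-entry sitemap:

fn main() -> anyhow::Result<()> {
    let sitemap = r#"<urlset>
        <url><loc>https://example.com/producto-1/p</loc></url>
        <url><loc>https://example.com/producto-2/p</loc></url>
    </urlset>"#;
    let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
    let urls: Vec<String> = dom
        .query_selector("loc")
        .unwrap() // None only for unsupported selectors; a tag name is fine
        .filter_map(|handle| handle.get(dom.parser()))
        .filter_map(|node| node.as_tag())
        .map(|tag| tag.inner_text(dom.parser()).to_string())
        .collect();
    assert_eq!(urls.len(), 2);
    Ok(())
}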