From f2401aa965357d08f017a5dd6e12abc75a87ee05 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Thu, 11 Jan 2024 14:09:18 -0300
Subject: [PATCH] parse file and init coto (WIP)

---
 scraper-rs/src/main.rs       | 48 ++++++++++++++++++----
 scraper-rs/src/sites/coto.rs | 78 ++++++++++++++++++++++++++++++++++++
 scraper-rs/src/sites/mod.rs  |  1 +
 3 files changed, 119 insertions(+), 8 deletions(-)
 create mode 100644 scraper-rs/src/sites/coto.rs
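
The new parse-file subcommand can be exercised against a saved product
page without touching the network. A minimal sketch of the workflow, run
from the scraper-rs/ directory, assuming clap's default kebab-case
subcommand naming for the ParseFile variant and a hypothetical local file
name:

    cargo run -- parse-file coto-product.html  # path is hypothetical

parse_file_cli reads the file, recovers the product URL from the page's
canonical <link> tag, and dispatches to the matching site parser through
parse_url.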

diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 9ef4c61..2dbe0fa 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -12,26 +12,33 @@ use std::{
     time::Duration,
 };
 use thiserror::Error;
+use tl::VDom;

 #[derive(Parser)] // requires `derive` feature
 enum Args {
     FetchList(FetchListArgs),
+    ParseFile(ParseFileArgs),
 }

 #[derive(clap::Args)]
 struct FetchListArgs {
     list_path: String,
 }
+#[derive(clap::Args)]
+struct ParseFileArgs {
+    file_path: String,
+}

 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     tracing_subscriber::fmt::init();
     match Args::parse() {
-        Args::FetchList(a) => fetch_list(a.list_path).await,
+        Args::FetchList(a) => fetch_list_cli(a.list_path).await,
+        Args::ParseFile(a) => parse_file_cli(a.file_path).await,
     }
 }

-async fn fetch_list(links_list_path: String) -> anyhow::Result<()> {
+async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     let links_str = fs::read_to_string(links_list_path).unwrap();
     let links = links_str
         .split('\n')
@@ -103,7 +110,6 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let url_p = Url::parse(&url).unwrap();
     let policy = RetryPolicy::exponential(Duration::from_millis(300))
         .with_max_retries(10)
         .with_jitter(true);
@@ -122,11 +128,7 @@ async fn fetch_and_parse(

     let maybe_point = {
         let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
-        match url_p.host_str().unwrap() {
-            "www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
-            "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
-            s => bail!("Unknown host {}", s),
-        }
+        parse_url(url, &dom)
     };

     let point = match maybe_point {
@@ -144,6 +146,36 @@
     Ok(point)
 }

+async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
+    let file = tokio::fs::read_to_string(file_path).await?;
+    let dom = tl::parse(&file, tl::ParserOptions::default())?;
+
+    let url = dom
+        .query_selector("link[rel=\"canonical\"]")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .and_then(|t| t.attributes().get("href").flatten())
+        .expect("No canonical link")
+        .as_utf8_str()
+        .to_string();
+
+    println!("URL: {}", &url);
+    println!("{:?}", parse_url(url, &dom));
+    Ok(())
+}
+
+fn parse_url(url: String, dom: &VDom) -> anyhow::Result<PrecioPoint> {
+    let url_p = Url::parse(&url).unwrap();
+    match url_p.host_str().unwrap() {
+        "www.carrefour.com.ar" => sites::carrefour::parse(url, dom),
+        "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom),
+        "www.cotodigital3.com.ar" => sites::coto::parse(url, dom),
+        s => bail!("Unknown host {}", s),
+    }
+}
+
 async fn db_writer(rx: Receiver<PrecioPoint>) {
     // let conn = Connection::open("../scraper/sqlite.db").unwrap();
     // let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs
new file mode 100644
index 0000000..5611341
--- /dev/null
+++ b/scraper-rs/src/sites/coto.rs
@@ -0,0 +1,78 @@
+use anyhow::Context;
+
+use crate::PrecioPoint;
+
+#[tracing::instrument(skip(dom))]
+pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
+    let ean = dom
+        .query_selector("div#brandText")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
+        .context("Couldn't find EAN parent")?
+        .query_selector(dom.parser(), "span.span_codigoplu")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .nth(1)
+        .context("Couldn't find the EAN")?
+        .inner_text(dom.parser())
+        .trim()
+        .to_string();
+
+    let precio_centavos = dom
+        .query_selector(".atg_store_newPrice")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .map(|t| t.inner_text(dom.parser()))
+        .filter(|s| !s.is_empty())
+        .map(|s| {
+            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
+            let s = s.trim();
+            s.parse::<f64>()
+        })
+        .transpose()
+        .context("Parsing price")?
+        .map(|f| (f * 100.0).round() as u64);
+
+    let in_stock = Some(
+        dom.query_selector(".product_not_available")
+            .unwrap()
+            .filter_map(|h| h.get(dom.parser()))
+            .filter_map(|n| n.as_tag())
+            .next()
+            .is_none(), // in stock only when the "not available" marker is absent
+    );
+
+    let name = dom
+        .query_selector("h1.product_page")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .map(|t| t.inner_text(dom.parser()))
+        .map(|s| s.trim().to_string());
+
+    let image_url = dom
+        .query_selector(".zoom img")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .and_then(|t| t.attributes().get("src").flatten())
+        .map(|s| s.as_utf8_str().to_string());
+
+    Ok(PrecioPoint {
+        ean,
+        fetched_at: crate::now_sec(),
+        in_stock,
+        name,
+        image_url,
+        parser_version: 5,
+        precio_centavos,
+        url,
+    })
+}
diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs
index e7f5705..70771c9 100644
--- a/scraper-rs/src/sites/mod.rs
+++ b/scraper-rs/src/sites/mod.rs
@@ -1,4 +1,5 @@
 pub mod carrefour;
 mod common;
+pub mod coto;
 pub mod dia;
 mod vtex;
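
A note on the price handling in sites/coto.rs: Coto renders prices in
Argentine formatting ("$1.234,56"), so the parser strips the '$', drops the
'.' thousands separators, and swaps the ',' for a decimal point before
parsing as f64 and converting to centavos. A self-contained sketch of that
normalization (parse_price_centavos is a hypothetical helper mirroring the
closure above, not part of the patch):

    /// Hypothetical helper: normalize an Argentine-formatted price
    /// string to centavos, mirroring the logic in sites/coto.rs.
    fn parse_price_centavos(raw: &str) -> Option<u64> {
        // "$1.234,56" -> "1234.56"
        let s = raw.replacen('$', "", 1).replace('.', "").replace(',', ".");
        let f = s.trim().parse::<f64>().ok()?;
        // Round instead of truncating: 1234.56_f64 * 100.0 lands slightly
        // below 123456.0, so a bare `as u64` cast would yield 123455.
        Some((f * 100.0).round() as u64)
    }

    fn main() {
        assert_eq!(parse_price_centavos("$1.234,56"), Some(123456));
        assert_eq!(parse_price_centavos("$99,90"), Some(9990));
    }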