From f2401aa965357d08f017a5dd6e12abc75a87ee05 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Thu, 11 Jan 2024 14:09:18 -0300
Subject: [PATCH] parse file and init coto (WIP)

---
 scraper-rs/src/main.rs       | 48 ++++++++++++++++++----
 scraper-rs/src/sites/coto.rs | 78 ++++++++++++++++++++++++++++++++++++
 scraper-rs/src/sites/mod.rs  |  1 +
 3 files changed, 119 insertions(+), 8 deletions(-)
 create mode 100644 scraper-rs/src/sites/coto.rs
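
The new parse-file subcommand can be exercised against a saved product
page without touching the network. A minimal sketch of the workflow, run
from the scraper-rs/ directory, assuming clap's default kebab-case
subcommand naming for the ParseFile variant and a hypothetical local file
name:

    cargo run -- parse-file coto-product.html  # path is hypothetical

parse_file_cli reads the file, recovers the product URL from the page's
canonical <link> tag, and dispatches to the matching site parser through
parse_url.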

diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs
index 9ef4c61..2dbe0fa 100644
--- a/scraper-rs/src/main.rs
+++ b/scraper-rs/src/main.rs
@@ -12,26 +12,33 @@ use std::{
     time::Duration,
 };
 use thiserror::Error;
+use tl::VDom;

 #[derive(Parser)] // requires `derive` feature
 enum Args {
     FetchList(FetchListArgs),
+    ParseFile(ParseFileArgs),
 }

 #[derive(clap::Args)]
 struct FetchListArgs {
     list_path: String,
 }
+#[derive(clap::Args)]
+struct ParseFileArgs {
+    file_path: String,
+}

 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     tracing_subscriber::fmt::init();
     match Args::parse() {
-        Args::FetchList(a) => fetch_list(a.list_path).await,
+        Args::FetchList(a) => fetch_list_cli(a.list_path).await,
+        Args::ParseFile(a) => parse_file_cli(a.file_path).await,
     }
 }

-async fn fetch_list(links_list_path: String) -> anyhow::Result<()> {
+async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     let links_str = fs::read_to_string(links_list_path).unwrap();
     let links = links_str
         .split('\n')
@@ -103,7 +110,6 @@ async fn fetch_and_parse(
     client: &reqwest::Client,
     url: String,
 ) -> Result<PrecioPoint, anyhow::Error> {
-    let url_p = Url::parse(&url).unwrap();
     let policy = RetryPolicy::exponential(Duration::from_millis(300))
         .with_max_retries(10)
         .with_jitter(true);
@@ -122,11 +128,7 @@ async fn fetch_and_parse(

     let maybe_point = {
         let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
-        match url_p.host_str().unwrap() {
-            "www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
-            "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
-            s => bail!("Unknown host {}", s),
-        }
+        parse_url(url, &dom)
     };

     let point = match maybe_point {
@@ -144,6 +146,36 @@
     Ok(point)
 }

+async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
+    let file = tokio::fs::read_to_string(file_path).await?;
+    let dom = tl::parse(&file, tl::ParserOptions::default())?;
+
+    let url = dom
+        .query_selector("link[rel=\"canonical\"]")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .and_then(|t| t.attributes().get("href").flatten())
+        .expect("No canonical link")
+        .as_utf8_str()
+        .to_string();
+
+    println!("URL: {}", &url);
+    println!("{:?}", parse_url(url, &dom));
+    Ok(())
+}
+
+fn parse_url(url: String, dom: &VDom) -> anyhow::Result<PrecioPoint> {
+    let url_p = Url::parse(&url).unwrap();
+    match url_p.host_str().unwrap() {
+        "www.carrefour.com.ar" => sites::carrefour::parse(url, dom),
+        "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom),
+        "www.cotodigital3.com.ar" => sites::coto::parse(url, dom),
+        s => bail!("Unknown host {}", s),
+    }
+}
+
 async fn db_writer(rx: Receiver<PrecioPoint>) {
     // let conn = Connection::open("../scraper/sqlite.db").unwrap();
     // let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs
new file mode 100644
index 0000000..5611341
--- /dev/null
+++ b/scraper-rs/src/sites/coto.rs
@@ -0,0 +1,78 @@
+use anyhow::Context;
+
+use crate::PrecioPoint;
+
+#[tracing::instrument(skip(dom))]
+pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
+    let ean = dom
+        .query_selector("div#brandText")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
+        .context("Couldn't find EAN parent")?
+        .query_selector(dom.parser(), "span.span_codigoplu")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .nth(1)
+        .context("Couldn't find the EAN")?
+        .inner_text(dom.parser())
+        .trim()
+        .to_string();
+
+    let precio_centavos = dom
+        .query_selector(".atg_store_newPrice")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .map(|t| t.inner_text(dom.parser()))
+        .filter(|s| !s.is_empty())
+        .map(|s| {
+            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
+            let s = s.trim();
+            s.parse::<f64>()
+        })
+        .transpose()
+        .context("Parsing price")?
+        .map(|f| (f * 100.0).round() as u64);
+
+    let in_stock = Some(
+        dom.query_selector(".product_not_available")
+            .unwrap()
+            .filter_map(|h| h.get(dom.parser()))
+            .filter_map(|n| n.as_tag())
+            .next()
+            .is_none(), // in stock only when the "not available" marker is absent
+    );
+
+    let name = dom
+        .query_selector("h1.product_page")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .map(|t| t.inner_text(dom.parser()))
+        .map(|s| s.trim().to_string());
+
+    let image_url = dom
+        .query_selector(".zoom img")
+        .unwrap()
+        .filter_map(|h| h.get(dom.parser()))
+        .filter_map(|n| n.as_tag())
+        .next()
+        .and_then(|t| t.attributes().get("src").flatten())
+        .map(|s| s.as_utf8_str().to_string());
+
+    Ok(PrecioPoint {
+        ean,
+        fetched_at: crate::now_sec(),
+        in_stock,
+        name,
+        image_url,
+        parser_version: 5,
+        precio_centavos,
+        url,
+    })
+}
diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs
index e7f5705..70771c9 100644
--- a/scraper-rs/src/sites/mod.rs
+++ b/scraper-rs/src/sites/mod.rs
@@ -1,4 +1,5 @@
 pub mod carrefour;
 mod common;
+pub mod coto;
 pub mod dia;
 mod vtex;
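
A note on the price handling in sites/coto.rs: Coto renders prices in
Argentine formatting ("$1.234,56"), so the parser strips the '$', drops the
'.' thousands separators, and swaps the ',' for a decimal point before
parsing as f64 and converting to centavos. A self-contained sketch of that
normalization (parse_price_centavos is a hypothetical helper mirroring the
closure above, not part of the patch):

    /// Hypothetical helper: normalize an Argentine-formatted price
    /// string to centavos, mirroring the logic in sites/coto.rs.
    fn parse_price_centavos(raw: &str) -> Option<u64> {
        // "$1.234,56" -> "1234.56"
        let s = raw.replacen('$', "", 1).replace('.', "").replace(',', ".");
        let f = s.trim().parse::<f64>().ok()?;
        // Round instead of truncating: 1234.56_f64 * 100.0 lands slightly
        // below 123456.0, so a bare `as u64` cast would yield 123455.
        Some((f * 100.0).round() as u64)
    }

    fn main() {
        assert_eq!(parse_price_centavos("$1.234,56"), Some(123456));
        assert_eq!(parse_price_centavos("$99,90"), Some(9990));
    }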