From 972d5ade18eb75b0ea18d428c225b0f7d4080049 Mon Sep 17 00:00:00 2001 From: Nulo Date: Thu, 11 Jan 2024 15:48:20 -0300 Subject: [PATCH] jumbo --- scraper-rs/src/main.rs | 61 ++++++++++++-------- scraper-rs/src/sites/carrefour.rs | 9 +-- scraper-rs/src/sites/jumbo.rs | 92 +++++++++++++++++++++++++++++++ scraper-rs/src/sites/mod.rs | 1 + scraper-rs/src/sites/vtex.rs | 17 +++++- 5 files changed, 148 insertions(+), 32 deletions(-) create mode 100644 scraper-rs/src/sites/jumbo.rs diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 2dbe0fa..e36459e 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -78,8 +78,12 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> { Ok(()) } +fn build_client() -> reqwest::Client { + reqwest::ClientBuilder::default().build().unwrap() +} + async fn worker(rx: Receiver, tx: Sender) { - let client = reqwest::ClientBuilder::default().build().unwrap(); + let client = build_client(); while let Ok(url) = rx.recv().await { let res = fetch_and_parse(&client, url.clone()).await; match res { @@ -126,10 +130,7 @@ async fn fetch_and_parse( } let body = response.text().await.map_err(FetchError::Http)?; - let maybe_point = { - let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?; - parse_url(url, &dom) - }; + let maybe_point = { scrap_url(client, url, &body).await }; let point = match maybe_point { Ok(p) => Ok(p), @@ -148,30 +149,44 @@ async fn fetch_and_parse( async fn parse_file_cli(file_path: String) -> anyhow::Result<()> { let file = tokio::fs::read_to_string(file_path).await?; - let dom = tl::parse(&file, tl::ParserOptions::default())?; - let url = dom - .query_selector("link[rel=\"canonical\"]") - .unwrap() - .filter_map(|h| h.get(dom.parser())) - .filter_map(|n| n.as_tag()) - .next() - .and_then(|t| t.attributes().get("href").flatten()) - .expect("No meta canonical") - .as_utf8_str() - .to_string(); + let client = build_client(); + + let url = { + let dom = tl::parse(&file, tl::ParserOptions::default())?; + dom.query_selector("link[rel=\"canonical\"]") + .unwrap() + .filter_map(|h| h.get(dom.parser())) + .filter_map(|n| n.as_tag()) + .next() + .and_then(|t| t.attributes().get("href").flatten()) + .expect("No meta canonical") + .as_utf8_str() + .to_string() + }; println!("URL: {}", &url); - println!("{:?}", parse_url(url, &dom)); + println!("{:?}", scrap_url(&client, url, &file).await); Ok(()) } -fn parse_url(url: String, dom: &VDom) -> anyhow::Result { +async fn scrap_url( + client: &reqwest::Client, + url: String, + body: &str, +) -> anyhow::Result { let url_p = Url::parse(&url).unwrap(); match url_p.host_str().unwrap() { - "www.carrefour.com.ar" => sites::carrefour::parse(url, dom), - "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom), - "www.cotodigital3.com.ar" => sites::coto::parse(url, dom), + "www.carrefour.com.ar" => { + sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?) + } + "diaonline.supermercadosdia.com.ar" => { + sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?) + } + "www.cotodigital3.com.ar" => { + sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?) + } + "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await, s => bail!("Unknown host {}", s), } } @@ -182,8 +197,8 @@ async fn db_writer(rx: Receiver) { let mut n = 0; while let Ok(res) = rx.recv().await { n += 1; - // println!("{}", n); - println!("{:?}", res) + println!("{}", n); + // println!("{:?}", res) } } diff --git a/scraper-rs/src/sites/carrefour.rs b/scraper-rs/src/sites/carrefour.rs index 109fb41..fe3a5a7 100644 --- a/scraper-rs/src/sites/carrefour.rs +++ b/scraper-rs/src/sites/carrefour.rs @@ -10,14 +10,7 @@ use super::vtex::find_product_ld; pub fn parse(url: String, dom: &tl::VDom) -> Result { let precio_centavos = common::price_from_meta(dom)?; - let in_stock = match common::get_meta_content(dom, "product:availability") { - Some(s) => match s.as_ref() { - "oos" => false, - "instock" => true, - _ => bail!("Not a valid product:availability"), - }, - None => bail!("No product:availability in carrefour"), - }; + let in_stock = vtex::in_stock_from_meta(dom)?; let ean = { let json = &vtex::parse_script_json(dom, "__STATE__")?; diff --git a/scraper-rs/src/sites/jumbo.rs b/scraper-rs/src/sites/jumbo.rs new file mode 100644 index 0000000..6247ead --- /dev/null +++ b/scraper-rs/src/sites/jumbo.rs @@ -0,0 +1,92 @@ +use std::str::FromStr; + +use anyhow::Context; +use reqwest::Url; +use serde::Deserialize; +use simple_error::bail; + +use crate::sites::common; +use crate::PrecioPoint; + +use super::vtex; + +#[derive(Deserialize)] +struct JumboSearch { + items: Vec, +} +#[derive(Deserialize)] +struct JumboSearchItem { + ean: String, +} + +async fn get_ean_from_search( + client: &reqwest::Client, + retailer_sku: String, +) -> anyhow::Result { + let s = client + .get({ + let mut url = + Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search") + .unwrap(); + url.set_query(Some(&format!("fq=skuId:{}", retailer_sku))); + url + }) + .send() + .await? + .text() + .await?; + let ean = { + let search: Vec = serde_json::from_str(&s)?; + let result = search.first().context("No search result")?; + let ean = result + .items + .first() + .context("No search result")? + .ean + .clone(); + if !result.items.iter().all(|i| i.ean == ean) { + bail!("Inesperado: no todos los items tienen el mismo EAN") + } + ean + }; + Ok(ean) +} + +pub async fn scrap( + client: &reqwest::Client, + url: String, + body: &str, +) -> Result { + let (name, image_url, sku, precio_centavos, in_stock) = { + let dom = tl::parse(body, tl::ParserOptions::default())?; + let precio_centavos = common::price_from_meta(&dom)?; + let in_stock = vtex::in_stock_from_meta(&dom)?; + + match vtex::find_product_ld(&dom) { + Some(pm) => { + let p = pm?; + ( + Some(p.name), + Some(p.image), + p.sku.context("No retailer SKU in Product LD")?, + precio_centavos, + in_stock, + ) + } + None => bail!("No JSON/LD"), + } + }; + + let ean = get_ean_from_search(client, sku).await?; + + Ok(PrecioPoint { + ean, + fetched_at: crate::now_sec(), + in_stock: Some(in_stock), + name, + image_url, + parser_version: 5, + precio_centavos, + url, + }) +} diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs index 70771c9..019de83 100644 --- a/scraper-rs/src/sites/mod.rs +++ b/scraper-rs/src/sites/mod.rs @@ -2,4 +2,5 @@ pub mod carrefour; mod common; pub mod coto; pub mod dia; +pub mod jumbo; mod vtex; diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs index 1189e3e..855e18e 100644 --- a/scraper-rs/src/sites/vtex.rs +++ b/scraper-rs/src/sites/vtex.rs @@ -1,8 +1,10 @@ -use anyhow::Context; +use anyhow::{bail, Context}; use serde::Deserialize; use simple_error::SimpleError; use tl::VDom; +use super::common; + pub fn parse_script_json(dom: &VDom, varname: &str) -> Result { let inner_html = &dom .query_selector("template[data-type=\"json\"]") @@ -85,3 +87,16 @@ pub enum AvailabilityLd { #[serde(rename = "http://schema.org/OutOfStock")] OutOfStock, } + +pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result { + Ok( + match common::get_meta_content(dom, "product:availability") { + Some(s) => match s.as_ref() { + "oos" => false, + "instock" => true, + _ => bail!("Not a valid product:availability"), + }, + None => bail!("No product:availability in carrefour"), + }, + ) +}