Mirror of https://github.com/catdevnull/preciazo.git (synced 2024-11-22 22:26:19 +00:00)

Commit 972d5ade18 (parent 1348bee6c7): jumbo
5 changed files with 148 additions and 32 deletions
@@ -78,8 +78,12 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     Ok(())
 }
 
+fn build_client() -> reqwest::Client {
+    reqwest::ClientBuilder::default().build().unwrap()
+}
+
 async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
-    let client = reqwest::ClientBuilder::default().build().unwrap();
+    let client = build_client();
     while let Ok(url) = rx.recv().await {
         let res = fetch_and_parse(&client, url.clone()).await;
         match res {
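The hunk above factors client construction into build_client() and keeps the worker loop pulling URLs off a channel. Below is a minimal sketch of that channel wiring, assuming the async-channel crate (its Receiver/Sender and recv().await returning a Result match the signatures in the hunk) and a Tokio runtime; do_work and the String output type are hypothetical stand-ins for fetch_and_parse and PrecioPoint, not names from the repo.

use async_channel::{Receiver, Sender};

async fn do_work(url: String) -> String {
    // placeholder for fetch_and_parse(&client, url).await
    format!("processed {url}")
}

async fn worker(rx: Receiver<String>, tx: Sender<String>) {
    // recv() returns Err once every sender is dropped, which ends the loop.
    while let Ok(url) = rx.recv().await {
        let out = do_work(url).await;
        if tx.send(out).await.is_err() {
            break; // the consumer side is gone
        }
    }
}

#[tokio::main]
async fn main() {
    let (url_tx, url_rx) = async_channel::unbounded::<String>();
    let (out_tx, out_rx) = async_channel::unbounded::<String>();

    // Several workers can share the same receiver by cloning it.
    for _ in 0..4 {
        tokio::spawn(worker(url_rx.clone(), out_tx.clone()));
    }
    drop(out_tx); // keep only the workers' senders alive

    url_tx.send("https://example.com/p/1".to_string()).await.unwrap();
    drop(url_tx); // close the queue so the workers exit when it drains

    while let Ok(out) = out_rx.recv().await {
        println!("{out}");
    }
}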
@@ -126,10 +130,7 @@ async fn fetch_and_parse(
     }
     let body = response.text().await.map_err(FetchError::Http)?;
 
-    let maybe_point = {
-        let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
-        parse_url(url, &dom)
-    };
+    let maybe_point = { scrap_url(client, url, &body).await };
 
     let point = match maybe_point {
         Ok(p) => Ok(p),
@@ -148,10 +149,12 @@ async fn fetch_and_parse(
 
 async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
     let file = tokio::fs::read_to_string(file_path).await?;
-    let dom = tl::parse(&file, tl::ParserOptions::default())?;
 
-    let url = dom
-        .query_selector("link[rel=\"canonical\"]")
+    let client = build_client();
+
+    let url = {
+        let dom = tl::parse(&file, tl::ParserOptions::default())?;
+        dom.query_selector("link[rel=\"canonical\"]")
         .unwrap()
         .filter_map(|h| h.get(dom.parser()))
         .filter_map(|n| n.as_tag())
@@ -159,19 +162,31 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
         .and_then(|t| t.attributes().get("href").flatten())
         .expect("No meta canonical")
         .as_utf8_str()
-        .to_string();
+        .to_string()
+    };
 
     println!("URL: {}", &url);
-    println!("{:?}", parse_url(url, &dom));
+    println!("{:?}", scrap_url(&client, url, &file).await);
     Ok(())
 }
 
-fn parse_url(url: String, dom: &VDom) -> anyhow::Result<PrecioPoint> {
+async fn scrap_url(
+    client: &reqwest::Client,
+    url: String,
+    body: &str,
+) -> anyhow::Result<PrecioPoint> {
     let url_p = Url::parse(&url).unwrap();
     match url_p.host_str().unwrap() {
-        "www.carrefour.com.ar" => sites::carrefour::parse(url, dom),
-        "diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom),
-        "www.cotodigital3.com.ar" => sites::coto::parse(url, dom),
+        "www.carrefour.com.ar" => {
+            sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+        }
+        "diaonline.supermercadosdia.com.ar" => {
+            sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+        }
+        "www.cotodigital3.com.ar" => {
+            sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+        }
+        "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
         s => bail!("Unknown host {}", s),
     }
 }
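parse_url becomes scrap_url here: it now takes the HTTP client and the raw body, parses a DOM only in the branches that need one, and awaits the Jumbo scraper, which also hits the network. A small standalone sketch of the dispatch-on-host idea follows, using the url crate (re-exported by reqwest as reqwest::Url); the Parser enum and parser_for_host are illustrative names only, not part of the repo.

use url::Url;

#[derive(Debug)]
enum Parser {
    Carrefour,
    Dia,
    Coto,
    Jumbo,
}

fn parser_for_host(raw_url: &str) -> Result<Parser, String> {
    // Parse the URL and branch on its host, as scrap_url does above.
    let parsed = Url::parse(raw_url).map_err(|e| e.to_string())?;
    match parsed.host_str() {
        Some("www.carrefour.com.ar") => Ok(Parser::Carrefour),
        Some("diaonline.supermercadosdia.com.ar") => Ok(Parser::Dia),
        Some("www.cotodigital3.com.ar") => Ok(Parser::Coto),
        Some("www.jumbo.com.ar") => Ok(Parser::Jumbo),
        other => Err(format!("Unknown host {:?}", other)),
    }
}

fn main() {
    // A hypothetical product URL; only the host matters for the dispatch.
    println!("{:?}", parser_for_host("https://www.jumbo.com.ar/some-product/p"));
}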
@@ -182,8 +197,8 @@ async fn db_writer(rx: Receiver<PrecioPoint>) {
     let mut n = 0;
     while let Ok(res) = rx.recv().await {
         n += 1;
-        // println!("{}", n);
-        println!("{:?}", res)
+        println!("{}", n);
+        // println!("{:?}", res)
     }
 }
 
@@ -10,14 +10,7 @@ use super::vtex::find_product_ld;
 pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
     let precio_centavos = common::price_from_meta(dom)?;
 
-    let in_stock = match common::get_meta_content(dom, "product:availability") {
-        Some(s) => match s.as_ref() {
-            "oos" => false,
-            "instock" => true,
-            _ => bail!("Not a valid product:availability"),
-        },
-        None => bail!("No product:availability in carrefour"),
-    };
+    let in_stock = vtex::in_stock_from_meta(dom)?;
 
     let ean = {
         let json = &vtex::parse_script_json(dom, "__STATE__")?;
scraper-rs/src/sites/jumbo.rs (new file, 92 lines)
@@ -0,0 +1,92 @@
+use std::str::FromStr;
+
+use anyhow::Context;
+use reqwest::Url;
+use serde::Deserialize;
+use simple_error::bail;
+
+use crate::sites::common;
+use crate::PrecioPoint;
+
+use super::vtex;
+
+#[derive(Deserialize)]
+struct JumboSearch {
+    items: Vec<JumboSearchItem>,
+}
+#[derive(Deserialize)]
+struct JumboSearchItem {
+    ean: String,
+}
+
+async fn get_ean_from_search(
+    client: &reqwest::Client,
+    retailer_sku: String,
+) -> anyhow::Result<String> {
+    let s = client
+        .get({
+            let mut url =
+                Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search")
+                    .unwrap();
+            url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
+            url
+        })
+        .send()
+        .await?
+        .text()
+        .await?;
+    let ean = {
+        let search: Vec<JumboSearch> = serde_json::from_str(&s)?;
+        let result = search.first().context("No search result")?;
+        let ean = result
+            .items
+            .first()
+            .context("No search result")?
+            .ean
+            .clone();
+        if !result.items.iter().all(|i| i.ean == ean) {
+            bail!("Inesperado: no todos los items tienen el mismo EAN")
+        }
+        ean
+    };
+    Ok(ean)
+}
+
+pub async fn scrap(
+    client: &reqwest::Client,
+    url: String,
+    body: &str,
+) -> Result<PrecioPoint, anyhow::Error> {
+    let (name, image_url, sku, precio_centavos, in_stock) = {
+        let dom = tl::parse(body, tl::ParserOptions::default())?;
+        let precio_centavos = common::price_from_meta(&dom)?;
+        let in_stock = vtex::in_stock_from_meta(&dom)?;
+
+        match vtex::find_product_ld(&dom) {
+            Some(pm) => {
+                let p = pm?;
+                (
+                    Some(p.name),
+                    Some(p.image),
+                    p.sku.context("No retailer SKU in Product LD")?,
+                    precio_centavos,
+                    in_stock,
+                )
+            }
+            None => bail!("No JSON/LD"),
+        }
+    };
+
+    let ean = get_ean_from_search(client, sku).await?;
+
+    Ok(PrecioPoint {
+        ean,
+        fetched_at: crate::now_sec(),
+        in_stock: Some(in_stock),
+        name,
+        image_url,
+        parser_version: 5,
+        precio_centavos,
+        url,
+    })
+}
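get_ean_from_search above resolves the retailer SKU to an EAN through the VTEX catalog search endpoint and deserializes the response with serde. Here is a minimal sketch of that deserialization step run against a hand-written payload; only items[].ean mirrors what the structs in the new file read, while productId/itemId and the overall shape are assumptions about the response for illustration.

use serde::Deserialize;

#[derive(Deserialize)]
struct JumboSearch {
    items: Vec<JumboSearchItem>,
}

#[derive(Deserialize)]
struct JumboSearchItem {
    ean: String,
}

fn main() -> anyhow::Result<()> {
    // Hypothetical sample payload; field names beyond items[].ean are made up.
    let sample = r#"[
        {
            "productId": "12345",
            "items": [
                { "itemId": "6789", "ean": "7790070410120" }
            ]
        }
    ]"#;

    // Unknown fields are ignored by serde by default, so only items[].ean is kept.
    let search: Vec<JumboSearch> = serde_json::from_str(sample)?;
    let ean = &search
        .first()
        .ok_or_else(|| anyhow::anyhow!("No search result"))?
        .items
        .first()
        .ok_or_else(|| anyhow::anyhow!("No search result"))?
        .ean;
    println!("EAN: {ean}"); // prints: EAN: 7790070410120
    Ok(())
}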
@@ -2,4 +2,5 @@ pub mod carrefour;
 mod common;
 pub mod coto;
 pub mod dia;
+pub mod jumbo;
 mod vtex;
@@ -1,8 +1,10 @@
-use anyhow::Context;
+use anyhow::{bail, Context};
 use serde::Deserialize;
 use simple_error::SimpleError;
 use tl::VDom;
 
+use super::common;
+
 pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
     let inner_html = &dom
         .query_selector("template[data-type=\"json\"]")
@@ -85,3 +87,16 @@ pub enum AvailabilityLd {
     #[serde(rename = "http://schema.org/OutOfStock")]
     OutOfStock,
 }
+
+pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
+    Ok(
+        match common::get_meta_content(dom, "product:availability") {
+            Some(s) => match s.as_ref() {
+                "oos" => false,
+                "instock" => true,
+                _ => bail!("Not a valid product:availability"),
+            },
+            None => bail!("No product:availability in carrefour"),
+        },
+    )
+}
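in_stock_from_meta centralizes the product:availability check that carrefour.rs previously did inline. common::get_meta_content itself is not part of this commit, so the sketch below stands in for it using the tl calls that do appear elsewhere in the diff (query_selector, get(parser), as_tag, attributes().get); the helper's exact signature and the HTML snippet are assumptions for illustration.

// Hypothetical stand-in for common::get_meta_content, built from the tl API
// used in this commit; returns the content attribute of the matching meta tag.
fn get_meta_content(dom: &tl::VDom, prop: &str) -> Option<String> {
    let selector = format!("meta[property=\"{}\"]", prop);
    dom.query_selector(&selector)?
        .filter_map(|h| h.get(dom.parser()))
        .filter_map(|n| n.as_tag())
        .find_map(|t| t.attributes().get("content").flatten())
        .map(|v| v.as_utf8_str().to_string())
}

fn main() -> anyhow::Result<()> {
    // Made-up snippet mimicking the meta tag the scraper looks for.
    let html = r#"<html><head>
        <meta property="product:availability" content="instock">
    </head></html>"#;
    let dom = tl::parse(html, tl::ParserOptions::default())?;

    let in_stock = match get_meta_content(&dom, "product:availability").as_deref() {
        Some("instock") => true,
        Some("oos") => false,
        other => anyhow::bail!("Not a valid product:availability: {:?}", other),
    };
    println!("in_stock = {in_stock}");
    Ok(())
}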