mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
scraper-rs: simplificar y parsear json ld
This commit is contained in:
parent
348d054b7b
commit
27aee01c1a
7 changed files with 218 additions and 216 deletions
7
scraper-rs/Cargo.lock
generated
7
scraper-rs/Cargo.lock
generated
|
@ -61,6 +61,12 @@ version = "0.2.16"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
|
||||
|
||||
[[package]]
|
||||
name = "async-channel"
|
||||
version = "2.1.1"
|
||||
|
@ -1016,6 +1022,7 @@ name = "scraper-rs"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"again",
|
||||
"anyhow",
|
||||
"async-channel",
|
||||
"nanoid",
|
||||
"rand 0.8.5",
|
||||
|
|
|
@ -7,6 +7,7 @@ edition = "2021"
|
|||
|
||||
[dependencies]
|
||||
again = "0.1.2"
|
||||
anyhow = "1.0.79"
|
||||
async-channel = "2.1.1"
|
||||
nanoid = "0.4.0"
|
||||
rand = "0.8.5"
|
||||
|
|
|
@ -1,105 +1,15 @@
|
|||
use again::RetryPolicy;
|
||||
use async_channel::{Receiver, Sender};
|
||||
use nanoid::nanoid;
|
||||
use rand::seq::SliceRandom;
|
||||
use reqwest::Url;
|
||||
use rusqlite::Connection;
|
||||
use simple_error::{bail, SimpleError};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
env::{self, args},
|
||||
fs,
|
||||
path::PathBuf,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
time::Duration,
|
||||
};
|
||||
use thiserror::Error;
|
||||
use tl::VDom;
|
||||
use tokio::io::{stderr, AsyncWriteExt};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PrecioPoint {
|
||||
ean: String,
|
||||
// unix
|
||||
fetched_at: u64,
|
||||
precio_centavos: Option<u64>,
|
||||
in_stock: Option<bool>,
|
||||
url: String,
|
||||
parser_version: u16,
|
||||
name: Option<String>,
|
||||
image_url: Option<String>,
|
||||
}
|
||||
|
||||
// fn main() {
|
||||
// let arg = args().skip(1).next().unwrap();
|
||||
|
||||
// let file_iter = fs::read_dir(arg)
|
||||
// .unwrap()
|
||||
// .filter(|pr| {
|
||||
// if let Ok(p) = pr {
|
||||
// !p.file_name().to_str().unwrap().ends_with(".link")
|
||||
// } else {
|
||||
// false
|
||||
// }
|
||||
// })
|
||||
// .take(1000)
|
||||
// .map(|f| fs::read(f.unwrap().path()).unwrap());
|
||||
|
||||
// let mut i = 0;
|
||||
// for item in file_iter {
|
||||
// i = i + 1;
|
||||
// {
|
||||
// // let mut text: Option<String> = None;
|
||||
// // let mut price_str: Option<String> = None;
|
||||
// // let mut rewriter = HtmlRewriter::new(
|
||||
// // Settings {
|
||||
// // element_content_handlers: vec![
|
||||
// // // Rewrite insecure hyperlinks
|
||||
// // element!("a[href]", |el| {
|
||||
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
|
||||
|
||||
// // el.set_attribute("href", &href).unwrap();
|
||||
|
||||
// // Ok(())
|
||||
// // }),
|
||||
// // (
|
||||
// // Cow::Owned("a".parse().unwrap()),
|
||||
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
|
||||
// // ),
|
||||
// // element!(
|
||||
// // "meta[property=\"product:price:amount\"]",
|
||||
// // extract_first_attr(&mut price_str, "content")
|
||||
// // ),
|
||||
// // ],
|
||||
// // memory_settings: lol_html::MemorySettings {
|
||||
// // preallocated_parsing_buffer_size: 1024 * 16,
|
||||
// // max_allowed_memory_usage: std::usize::MAX,
|
||||
// // },
|
||||
// // ..Settings::default()
|
||||
// // },
|
||||
// // |_: &[u8]| {},
|
||||
// // );
|
||||
|
||||
// // rewriter.write(&item).unwrap();
|
||||
// // rewriter.end().unwrap();
|
||||
// // println!("{:#?}", price_str);
|
||||
|
||||
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
|
||||
|
||||
// let html = String::from_utf8(item).unwrap();
|
||||
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
|
||||
|
||||
// match parse_carrefour("".into(), &dom) {
|
||||
// Ok(point) => {
|
||||
// // println!("{:?}", point);
|
||||
// }
|
||||
// Err(err) => {
|
||||
// // println!("Error {:#?}: {}", err, html);
|
||||
// }
|
||||
// };
|
||||
// }
|
||||
// }
|
||||
// println!("n={}", i);
|
||||
// }
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
@ -174,7 +84,10 @@ enum FetchError {
|
|||
}
|
||||
|
||||
#[tracing::instrument(skip(client))]
|
||||
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
|
||||
async fn fetch_and_parse(
|
||||
client: &reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(10)
|
||||
.with_jitter(true);
|
||||
|
@ -187,13 +100,13 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
|
|||
.await
|
||||
.map_err(FetchError::Http)?;
|
||||
if !response.status().is_success() {
|
||||
return Err(FetchError::HttpStatus(response.status()));
|
||||
bail!(FetchError::HttpStatus(response.status()));
|
||||
}
|
||||
let body = response.text().await.map_err(FetchError::Http)?;
|
||||
|
||||
let maybe_point = {
|
||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
||||
parse_carrefour(url, &dom)
|
||||
sites::carrefour::parse(url, &dom)
|
||||
};
|
||||
|
||||
let point = match maybe_point {
|
||||
|
@ -211,120 +124,32 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
|
|||
Ok(point)
|
||||
}
|
||||
|
||||
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, SimpleError> {
|
||||
let precio_centavos = {
|
||||
get_meta_content(dom, "product:price:amount")?
|
||||
.map(|s| {
|
||||
s.parse::<f64>()
|
||||
.map_err(|_| SimpleError::new("Failed to parse number"))
|
||||
})
|
||||
.transpose()
|
||||
.map(|f| f.map(|f| (f * 100.0) as u64))
|
||||
}?;
|
||||
|
||||
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
|
||||
let in_stock = match in_stock_meta {
|
||||
Some(s) => match s.as_ref() {
|
||||
"oos" => Some(false),
|
||||
"instock" => Some(true),
|
||||
_ => return Err(SimpleError::new("Not a valid product:availability")),
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
|
||||
let ean = {
|
||||
let json = &parse_script_json(dom, "__STATE__")?;
|
||||
let state = json
|
||||
.as_object()
|
||||
.ok_or(SimpleError::new("Seed state not an object"))?;
|
||||
if state.is_empty() {
|
||||
bail!("Seed state is an empty object")
|
||||
}
|
||||
let (_, product_json) = state
|
||||
.into_iter()
|
||||
.find(|(key, val)| {
|
||||
key.starts_with("Product:")
|
||||
&& val
|
||||
.as_object()
|
||||
.and_then(|val| val.get("__typename"))
|
||||
.map_or(false, |typename| typename == "Product")
|
||||
})
|
||||
.ok_or(SimpleError::new("No product in seed state"))?;
|
||||
let cache_id = product_json
|
||||
.get("cacheId")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or(SimpleError::new("No cacheId in seed state"))?;
|
||||
let (_, product_sku_json) = state
|
||||
.iter()
|
||||
.find(|(key, val)| {
|
||||
key.starts_with(&format!("Product:{}", cache_id))
|
||||
&& val.as_object().map_or(false, |obj| {
|
||||
obj.get("__typename")
|
||||
.map_or(false, |typename| typename == "SKU")
|
||||
})
|
||||
})
|
||||
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
|
||||
product_sku_json
|
||||
.get("ean")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or(SimpleError::new("No product SKU in seed state"))?
|
||||
.to_string()
|
||||
};
|
||||
|
||||
Ok(PrecioPoint {
|
||||
ean,
|
||||
fetched_at: now_sec(),
|
||||
in_stock,
|
||||
name: None,
|
||||
image_url: None,
|
||||
parser_version: 5,
|
||||
precio_centavos,
|
||||
url,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_meta_content<'a>(
|
||||
dom: &'a VDom<'a>,
|
||||
prop: &str,
|
||||
) -> Result<Option<Cow<'a, str>>, SimpleError> {
|
||||
let tag = &dom
|
||||
.query_selector(&format!("meta[property=\"{}\"]", prop))
|
||||
.and_then(|mut iter| iter.next())
|
||||
.and_then(|h| h.get(dom.parser()))
|
||||
.and_then(|n| n.as_tag());
|
||||
match tag {
|
||||
Some(tag) => Ok(Some(
|
||||
tag.attributes()
|
||||
.get("content")
|
||||
.flatten()
|
||||
.ok_or(SimpleError::new("Failed to get content attr"))?
|
||||
.as_utf8_str(),
|
||||
)),
|
||||
None => Ok(None),
|
||||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
||||
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
||||
let mut n = 0;
|
||||
while let Ok(res) = rx.recv().await {
|
||||
n += 1;
|
||||
println!("{}", n);
|
||||
println!("{:?}", res)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, SimpleError> {
|
||||
let parser = dom.parser();
|
||||
let inner_html = &dom
|
||||
.query_selector(&format!(
|
||||
"template[data-type=\"json\"][data-varname=\"{}\"]",
|
||||
varname
|
||||
))
|
||||
.and_then(|mut iter| iter.next())
|
||||
.and_then(|h| h.get(parser))
|
||||
.and_then(|n| n.as_tag())
|
||||
.and_then(|t| {
|
||||
t.children()
|
||||
.all(parser)
|
||||
.iter()
|
||||
.find(|n| n.as_tag().is_some())
|
||||
})
|
||||
.ok_or(SimpleError::new("Failed to get script tag"))?
|
||||
.inner_html(parser);
|
||||
inner_html
|
||||
.parse()
|
||||
.map_err(|_| SimpleError::new("Couldn't parse JSON in script"))
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
mod sites;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PrecioPoint {
|
||||
ean: String,
|
||||
// unix
|
||||
fetched_at: u64,
|
||||
precio_centavos: Option<u64>,
|
||||
in_stock: Option<bool>,
|
||||
url: String,
|
||||
parser_version: u16,
|
||||
name: Option<String>,
|
||||
image_url: Option<String>,
|
||||
}
|
||||
|
||||
fn now_sec() -> u64 {
|
||||
|
@ -334,14 +159,3 @@ fn now_sec() -> u64 {
|
|||
.expect("Time went backwards");
|
||||
since_the_epoch.as_secs()
|
||||
}
|
||||
|
||||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
||||
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
||||
let mut n = 0;
|
||||
while let Ok(res) = rx.recv().await {
|
||||
n += 1;
|
||||
println!("{}", n);
|
||||
// println!("{:?}", res)
|
||||
}
|
||||
}
|
||||
|
|
77
scraper-rs/src/sites/carrefour.rs
Normal file
77
scraper-rs/src/sites/carrefour.rs
Normal file
|
@ -0,0 +1,77 @@
|
|||
use simple_error::bail;
|
||||
use simple_error::SimpleError;
|
||||
|
||||
use crate::sites::common;
|
||||
use crate::sites::vtex;
|
||||
use crate::PrecioPoint;
|
||||
|
||||
use super::vtex::find_product_ld;
|
||||
|
||||
/// Parses a Carrefour product page (a VTEX storefront) into a `PrecioPoint`.
///
/// Extraction sources, in order:
/// 1. `product:price:amount` / `product:availability` meta tags (OpenGraph-style),
/// 2. the VTEX `__STATE__` embedded JSON for the EAN,
/// 3. the JSON-LD `Product` block for name and image.
///
/// Errors when availability is missing/unrecognized, when the seed state does
/// not contain the expected Product/SKU entries, or when an in-stock product
/// has no JSON-LD block.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    // Price is published in whole currency units; convert to centavos.
    // A missing meta tag yields None (price unknown), a malformed number errors.
    let precio_centavos = common::get_meta_content(dom, "product:price:amount")
        .map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
        .transpose()?;

    // Availability is mandatory here: "oos" / "instock" are the only values
    // this storefront is known to emit — TODO confirm against live pages.
    let in_stock = match common::get_meta_content(dom, "product:availability") {
        Some(s) => match s.as_ref() {
            "oos" => false,
            "instock" => true,
            _ => bail!("Not a valid product:availability"),
        },
        None => bail!("No product:availability in carrefour"),
    };

    let ean = {
        // VTEX serializes its apollo-style cache as a flat object keyed by
        // "<Typename>:<id>..." entries inside the __STATE__ template.
        let json = &vtex::parse_script_json(dom, "__STATE__")?;
        let state = json
            .as_object()
            .ok_or(SimpleError::new("Seed state not an object"))?;
        if state.is_empty() {
            bail!("Seed state is an empty object")
        }
        // First locate the Product entry itself...
        let (_, product_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
            })
            .ok_or(SimpleError::new("No product in seed state"))?;
        let cache_id = product_json
            .get("cacheId")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No cacheId in seed state"))?;
        // ...then the SKU entry nested under the same cacheId prefix,
        // which is where the EAN lives.
        let (_, product_sku_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with(&format!("Product:{}", cache_id))
                    && val.get("__typename").is_some_and(|t| t == "SKU")
            })
            .ok_or(SimpleError::new("No Product:cacheId* found"))?;
        product_sku_json
            .get("ean")
            .and_then(|v| v.as_str())
            .ok_or(SimpleError::new("No product SKU in seed state"))?
            .to_string()
    };

    // Name/image come from JSON-LD. Out-of-stock pages legitimately lack the
    // block; an in-stock page without it is treated as a parse failure.
    let (name, image_url) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (Some(p.name), Some(p.image))
        }
        None => match in_stock {
            true => bail!("No JSONLD product in in stock product"),
            false => (None, None),
        },
    };

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}
|
12
scraper-rs/src/sites/common.rs
Normal file
12
scraper-rs/src/sites/common.rs
Normal file
|
@ -0,0 +1,12 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use tl::VDom;
|
||||
|
||||
pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
|
||||
dom.query_selector(&format!("meta[property=\"{}\"]", prop))
|
||||
.and_then(|mut iter| iter.next())
|
||||
.and_then(|h| h.get(dom.parser()))
|
||||
.and_then(|n| n.as_tag())
|
||||
.and_then(|tag| tag.attributes().get("content").flatten())
|
||||
.map(|s| s.as_utf8_str())
|
||||
}
|
3
scraper-rs/src/sites/mod.rs
Normal file
3
scraper-rs/src/sites/mod.rs
Normal file
|
@ -0,0 +1,3 @@
|
|||
pub mod carrefour;
|
||||
mod common;
|
||||
mod vtex;
|
88
scraper-rs/src/sites/vtex.rs
Normal file
88
scraper-rs/src/sites/vtex.rs
Normal file
|
@ -0,0 +1,88 @@
|
|||
use anyhow::Context;
|
||||
use serde::Deserialize;
|
||||
use simple_error::SimpleError;
|
||||
use tl::VDom;
|
||||
|
||||
pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
|
||||
let inner_html = &dom
|
||||
.query_selector("template[data-type=\"json\"]")
|
||||
.unwrap()
|
||||
.filter_map(|h| h.get(dom.parser()).and_then(|n| n.as_tag()))
|
||||
.find(|t| {
|
||||
t.attributes()
|
||||
.get("data-varname")
|
||||
.flatten()
|
||||
.map_or(false, |v| v.as_utf8_str() == varname)
|
||||
})
|
||||
.ok_or(SimpleError::new("Failed to get template tag"))?
|
||||
.query_selector(dom.parser(), "script")
|
||||
.and_then(|mut it| it.next())
|
||||
.and_then(|h| h.get(dom.parser()))
|
||||
.ok_or(SimpleError::new("Failed to get script tag"))?
|
||||
.inner_html(dom.parser());
|
||||
inner_html.parse().context("Couldn't parse JSON in script")
|
||||
}
|
||||
|
||||
pub fn get_json_lds<'a>(
|
||||
dom: &'a VDom,
|
||||
) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
|
||||
dom.query_selector("script[type=\"application/ld+json\"]")
|
||||
.unwrap()
|
||||
.filter_map(|h| h.get(dom.parser()))
|
||||
.filter_map(|n| n.as_tag())
|
||||
.map(|t| serde_json::from_str(&t.inner_html(dom.parser())))
|
||||
}
|
||||
#[tracing::instrument]
|
||||
pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
|
||||
get_json_lds(dom)
|
||||
.filter_map(|v| v.ok())
|
||||
.find(|v| v.get("@type").is_some_and(|t| t == typ))
|
||||
.map(serde_json::from_value)
|
||||
}
|
||||
pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
|
||||
find_json_ld(dom, "Product").map(|l| {
|
||||
l.map(|l| match l {
|
||||
Ld::Product(p) => p,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// A schema.org JSON-LD entity, discriminated by its `@type` field.
/// Only `Product` is modeled; any other `@type` fails deserialization.
#[derive(Deserialize)]
#[serde(tag = "@type")]
pub enum Ld {
    Product(ProductLd),
}

/// A schema.org `Product` object, keeping only the fields this scraper reads.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProductLd {
    pub name: String,
    pub image: String,
    pub sku: Option<String>,
    pub offers: OffersLd,
}

/// Container for the offer list (schema.org `AggregateOffer`-shaped —
/// TODO confirm against real payloads).
#[derive(Deserialize)]
pub struct OffersLd {
    pub offers: Vec<OfferLd>,
}

/// A single schema.org `Offer`.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct OfferLd {
    // Deserialized only to validate that `@type` is `Offer`; never read.
    #[serde(rename = "@type")]
    _type: OfferTypeLd,
    pub price: f64,
    pub price_currency: String,
    pub availability: AvailabilityLd,
}

/// `@type` discriminant for [`OfferLd`]; only `Offer` is accepted.
#[derive(Deserialize)]
pub enum OfferTypeLd {
    Offer,
}

/// Item availability, limited to the two schema.org URLs handled here;
/// any other value fails deserialization.
#[derive(Deserialize)]
pub enum AvailabilityLd {
    #[serde(rename = "http://schema.org/InStock")]
    InStock,
    #[serde(rename = "http://schema.org/OutOfStock")]
    OutOfStock,
}
|
Loading…
Reference in a new issue