Compare commits

...

9 commits

Author SHA1 Message Date
972d5ade18 jumbo 2024-01-11 15:48:20 -03:00
1348bee6c7 no instrumentar funciones pesadas
arregla problemas de perf
2024-01-11 15:47:54 -03:00
8e8fe8ddaf lista jumbo actualizada
cambian las urls muy rápido
2024-01-11 15:30:25 -03:00
37ceb15e74 arreglar image_url coto 2024-01-11 14:45:50 -03:00
f2401aa965 parse file y init coto (WIP 2024-01-11 14:09:18 -03:00
3a31586193 limpiar 2024-01-11 14:02:13 -03:00
8f6f62a261 clap cli 2024-01-11 14:02:02 -03:00
b696551949 scraper-rs: dia 2024-01-11 13:05:51 -03:00
27aee01c1a scraper-rs: simplificar y parsear json ld 2024-01-11 12:55:14 -03:00
11 changed files with 702 additions and 228 deletions

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
size 922185
oid sha256:f231884c2b9fd0b633746892a00824379b4d8aa110e6348309197b83b0d1c555
size 926218

214
scraper-rs/Cargo.lock generated
View file

@ -61,6 +61,60 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "anstream"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
[[package]]
name = "anstyle-parse"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
]
[[package]]
name = "anyhow"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
[[package]]
name = "async-channel"
version = "2.1.1"
@ -175,6 +229,52 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c12ed66a79a555082f595f7eb980d08669de95009dd4b3d61168c573ebe38fc9"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f4645eab3431e5a8403a96bea02506a8b35d28cd0f0330977dd5d22f9c84f43"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "colorchoice"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
name = "concurrent-queue"
version = "2.4.0"
@ -452,6 +552,12 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.3"
@ -647,7 +753,7 @@ checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
dependencies = [
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -751,7 +857,7 @@ dependencies = [
"libc",
"redox_syscall 0.4.1",
"smallvec",
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
@ -945,7 +1051,7 @@ dependencies = [
"libc",
"spin",
"untrusted",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -1016,7 +1122,9 @@ name = "scraper-rs"
version = "0.1.0"
dependencies = [
"again",
"anyhow",
"async-channel",
"clap",
"nanoid",
"rand 0.8.5",
"reqwest",
@ -1130,7 +1238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -1139,6 +1247,12 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "strsim"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "syn"
version = "2.0.48"
@ -1237,7 +1351,7 @@ dependencies = [
"signal-hook-registry",
"socket2",
"tokio-macros",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]
@ -1394,6 +1508,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf8parse"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "valuable"
version = "0.1.0"
@ -1558,7 +1678,16 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.0",
]
[[package]]
@ -1567,13 +1696,28 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
dependencies = [
"windows_aarch64_gnullvm 0.52.0",
"windows_aarch64_msvc 0.52.0",
"windows_i686_gnu 0.52.0",
"windows_i686_msvc 0.52.0",
"windows_x86_64_gnu 0.52.0",
"windows_x86_64_gnullvm 0.52.0",
"windows_x86_64_msvc 0.52.0",
]
[[package]]
@ -1582,42 +1726,84 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
[[package]]
name = "winreg"
version = "0.50.0"
@ -1625,7 +1811,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [
"cfg-if",
"windows-sys",
"windows-sys 0.48.0",
]
[[package]]

View file

@ -7,7 +7,9 @@ edition = "2021"
[dependencies]
again = "0.1.2"
anyhow = "1.0.79"
async-channel = "2.1.1"
clap = { version = "4.4.15", features = ["derive"] }
nanoid = "0.4.0"
rand = "0.8.5"
# lol_html = "1.2.0"

View file

@ -1,112 +1,44 @@
use again::RetryPolicy;
use async_channel::{Receiver, Sender};
use clap::Parser;
use nanoid::nanoid;
use rand::seq::SliceRandom;
use reqwest::Url;
use rusqlite::Connection;
use simple_error::{bail, SimpleError};
use std::{
borrow::Cow,
env::{self, args},
env::{self},
fs,
path::PathBuf,
time::{Duration, SystemTime, UNIX_EPOCH},
time::Duration,
};
use thiserror::Error;
use tl::VDom;
use tokio::io::{stderr, AsyncWriteExt};
#[derive(Debug)]
struct PrecioPoint {
ean: String,
// unix
fetched_at: u64,
precio_centavos: Option<u64>,
in_stock: Option<bool>,
url: String,
parser_version: u16,
name: Option<String>,
image_url: Option<String>,
// Command-line interface: one clap subcommand per variant, dispatched in `main`.
// Plain `//` comments are used on purpose here: with clap's derive, `///` doc comments
// are turned into help text and would change the CLI output.
#[derive(Parser)] // requires `derive` feature
enum Args {
// Handled by `fetch_list_cli`, which receives `list_path`.
FetchList(FetchListArgs),
// Handled by `parse_file_cli`, which receives `file_path`.
ParseFile(ParseFileArgs),
}
// Arguments of the `FetchList` subcommand.
// (`//` instead of `///` so clap does not pick the text up as help output.)
#[derive(clap::Args)]
struct FetchListArgs {
// Path of the links list; `fetch_list_cli` reads it as text and splits it on '\n'.
list_path: String,
}
// Arguments of the `ParseFile` subcommand.
// (`//` instead of `///` so clap does not pick the text up as help output.)
#[derive(clap::Args)]
struct ParseFileArgs {
// Path of a saved page; `parse_file_cli` reads it as text and parses it as HTML.
file_path: String,
}
// fn main() {
// let arg = args().skip(1).next().unwrap();
// let file_iter = fs::read_dir(arg)
// .unwrap()
// .filter(|pr| {
// if let Ok(p) = pr {
// !p.file_name().to_str().unwrap().ends_with(".link")
// } else {
// false
// }
// })
// .take(1000)
// .map(|f| fs::read(f.unwrap().path()).unwrap());
// let mut i = 0;
// for item in file_iter {
// i = i + 1;
// {
// // let mut text: Option<String> = None;
// // let mut price_str: Option<String> = None;
// // let mut rewriter = HtmlRewriter::new(
// // Settings {
// // element_content_handlers: vec![
// // // Rewrite insecure hyperlinks
// // element!("a[href]", |el| {
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
// // el.set_attribute("href", &href).unwrap();
// // Ok(())
// // }),
// // (
// // Cow::Owned("a".parse().unwrap()),
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
// // ),
// // element!(
// // "meta[property=\"product:price:amount\"]",
// // extract_first_attr(&mut price_str, "content")
// // ),
// // ],
// // memory_settings: lol_html::MemorySettings {
// // preallocated_parsing_buffer_size: 1024 * 16,
// // max_allowed_memory_usage: std::usize::MAX,
// // },
// // ..Settings::default()
// // },
// // |_: &[u8]| {},
// // );
// // rewriter.write(&item).unwrap();
// // rewriter.end().unwrap();
// // println!("{:#?}", price_str);
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
// let html = String::from_utf8(item).unwrap();
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
// match parse_carrefour("".into(), &dom) {
// Ok(point) => {
// // println!("{:?}", point);
// }
// Err(err) => {
// // println!("Error {:#?}: {}", err, html);
// }
// };
// }
// }
// println!("n={}", i);
// }
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt::init();
let mut args = args().skip(1);
let links_list_path = args.next().expect("Falta arg para path de lista de urls");
match Args::parse() {
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
}
}
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
let links_str = fs::read_to_string(links_list_path).unwrap();
let links = links_str
.split('\n')
@ -146,8 +78,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}
/// Creates the `reqwest` client shared by the fetch workers and the `ParseFile` command,
/// using the builder's default settings.
///
/// # Panics
/// Panics if the client cannot be built.
fn build_client() -> reqwest::Client {
    let builder = reqwest::Client::builder();
    builder.build().unwrap()
}
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
let client = reqwest::ClientBuilder::default().build().unwrap();
let client = build_client();
while let Ok(url) = rx.recv().await {
let res = fetch_and_parse(&client, url.clone()).await;
match res {
@ -174,7 +110,10 @@ enum FetchError {
}
#[tracing::instrument(skip(client))]
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
async fn fetch_and_parse(
client: &reqwest::Client,
url: String,
) -> Result<PrecioPoint, anyhow::Error> {
let policy = RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10)
.with_jitter(true);
@ -187,14 +126,11 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
.await
.map_err(FetchError::Http)?;
if !response.status().is_success() {
return Err(FetchError::HttpStatus(response.status()));
bail!(FetchError::HttpStatus(response.status()));
}
let body = response.text().await.map_err(FetchError::Http)?;
let maybe_point = {
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
parse_carrefour(url, &dom)
};
let maybe_point = { scrap_url(client, url, &body).await };
let point = match maybe_point {
Ok(p) => Ok(p),
@ -211,130 +147,50 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
Ok(point)
}
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, SimpleError> {
let precio_centavos = {
get_meta_content(dom, "product:price:amount")?
.map(|s| {
s.parse::<f64>()
.map_err(|_| SimpleError::new("Failed to parse number"))
})
.transpose()
.map(|f| f.map(|f| (f * 100.0) as u64))
}?;
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
let file = tokio::fs::read_to_string(file_path).await?;
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
let in_stock = match in_stock_meta {
Some(s) => match s.as_ref() {
"oos" => Some(false),
"instock" => Some(true),
_ => return Err(SimpleError::new("Not a valid product:availability")),
},
None => None,
};
let client = build_client();
let ean = {
let json = &parse_script_json(dom, "__STATE__")?;
let state = json
.as_object()
.ok_or(SimpleError::new("Seed state not an object"))?;
if state.is_empty() {
bail!("Seed state is an empty object")
}
let (_, product_json) = state
.into_iter()
.find(|(key, val)| {
key.starts_with("Product:")
&& val
.as_object()
.and_then(|val| val.get("__typename"))
.map_or(false, |typename| typename == "Product")
})
.ok_or(SimpleError::new("No product in seed state"))?;
let cache_id = product_json
.get("cacheId")
.and_then(|v| v.as_str())
.ok_or(SimpleError::new("No cacheId in seed state"))?;
let (_, product_sku_json) = state
.iter()
.find(|(key, val)| {
key.starts_with(&format!("Product:{}", cache_id))
&& val.as_object().map_or(false, |obj| {
obj.get("__typename")
.map_or(false, |typename| typename == "SKU")
})
})
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
product_sku_json
.get("ean")
.and_then(|v| v.as_str())
.ok_or(SimpleError::new("No product SKU in seed state"))?
let url = {
let dom = tl::parse(&file, tl::ParserOptions::default())?;
dom.query_selector("link[rel=\"canonical\"]")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.and_then(|t| t.attributes().get("href").flatten())
.expect("No meta canonical")
.as_utf8_str()
.to_string()
};
Ok(PrecioPoint {
ean,
fetched_at: now_sec(),
in_stock,
name: None,
image_url: None,
parser_version: 5,
precio_centavos,
url,
})
println!("URL: {}", &url);
println!("{:?}", scrap_url(&client, url, &file).await);
Ok(())
}
fn get_meta_content<'a>(
dom: &'a VDom<'a>,
prop: &str,
) -> Result<Option<Cow<'a, str>>, SimpleError> {
let tag = &dom
.query_selector(&format!("meta[property=\"{}\"]", prop))
.and_then(|mut iter| iter.next())
.and_then(|h| h.get(dom.parser()))
.and_then(|n| n.as_tag());
match tag {
Some(tag) => Ok(Some(
tag.attributes()
.get("content")
.flatten()
.ok_or(SimpleError::new("Failed to get content attr"))?
.as_utf8_str(),
)),
None => Ok(None),
/// Dispatches `body` to the scraper that matches the host of `url`.
///
/// `client` is only forwarded to the Jumbo scraper; the other sites are parsed from `body`
/// alone.
///
/// # Errors
/// Returns an error if `url` is not a valid URL, has no host, or belongs to an unknown host,
/// if `body` cannot be parsed by `tl`, or if the site-specific scraper fails.
async fn scrap_url(
    client: &reqwest::Client,
    url: String,
    body: &str,
) -> anyhow::Result<PrecioPoint> {
    // URLs come from link lists and saved pages, so a malformed one is reported as an error
    // instead of panicking the whole worker.
    let url_p = Url::parse(&url)?;
    let host = url_p
        .host_str()
        .ok_or_else(|| anyhow::anyhow!("URL has no host: {}", url))?;
    match host {
        "www.carrefour.com.ar" => {
            sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "diaonline.supermercadosdia.com.ar" => {
            sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "www.cotodigital3.com.ar" => {
            sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
        }
        "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
        s => bail!("Unknown host {}", s),
    }
}
/// Reads the JSON embedded in the page under
/// `template[data-type="json"][data-varname="<varname>"]` and parses it into a
/// `serde_json::Value`.
///
/// # Errors
/// - "Failed to get script tag": no matching template, or no tag node among the nodes
///   returned by `children().all(parser)`.
/// - "Couldn't parse JSON in script": the tag's inner HTML is not valid JSON.
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, SimpleError> {
let parser = dom.parser();
let inner_html = &dom
.query_selector(&format!(
"template[data-type=\"json\"][data-varname=\"{}\"]",
varname
))
// Only the first element matching the selector is considered.
.and_then(|mut iter| iter.next())
.and_then(|h| h.get(parser))
.and_then(|n| n.as_tag())
.and_then(|t| {
// Pick the first node that is a tag (skipping non-tag nodes such as raw text).
t.children()
.all(parser)
.iter()
.find(|n| n.as_tag().is_some())
})
.ok_or(SimpleError::new("Failed to get script tag"))?
.inner_html(parser);
// The original parse error is discarded and replaced by a fixed message.
inner_html
.parse()
.map_err(|_| SimpleError::new("Couldn't parse JSON in script"))
}
/// Returns the current system time as whole seconds since the Unix epoch.
///
/// # Panics
/// Panics with "Time went backwards" if the system clock is set before the epoch.
fn now_sec() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_secs())
        .expect("Time went backwards")
}
async fn db_writer(rx: Receiver<PrecioPoint>) {
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
@ -345,3 +201,28 @@ async fn db_writer(rx: Receiver<PrecioPoint>) {
// println!("{:?}", res)
}
}
use std::time::{SystemTime, UNIX_EPOCH};
mod sites;
/// One scraped price observation for a product page; every site parser returns this type.
#[derive(Debug)]
struct PrecioPoint {
/// Product EAN code as extracted by the site parser.
ean: String,
/// When the point was built: seconds since the Unix epoch (filled from `now_sec()`).
fetched_at: u64,
/// Price in centavos (price × 100), or `None` when the page exposed no price.
precio_centavos: Option<u64>,
/// Whether the product is in stock, when the parser could determine it.
in_stock: Option<bool>,
/// URL the point was scraped from.
url: String,
/// Version number stamped by the parser that produced the point.
parser_version: u16,
/// Product name, when available.
name: Option<String>,
/// Product image URL, when available.
image_url: Option<String>,
}
/// Returns the current system time as whole seconds since the Unix epoch.
///
/// # Panics
/// Panics with "Time went backwards" if the system clock is set before the epoch.
fn now_sec() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_secs())
        .expect("Time went backwards")
}

View file

@ -0,0 +1,68 @@
use simple_error::bail;
use simple_error::SimpleError;
use crate::sites::common;
use crate::sites::vtex;
use crate::PrecioPoint;
use super::vtex::find_product_ld;
/// Builds a [`PrecioPoint`] from the DOM of a Carrefour product page.
///
/// The price and the stock flag come from `<meta>` tags, the EAN from the `__STATE__` JSON
/// embedded in the page, and the name/image from the JSON-LD `Product`, when present.
///
/// # Errors
/// Fails when the price meta is not a number, when the availability meta is missing or
/// invalid, when the `__STATE__` JSON is missing or lacks the expected product/SKU entries,
/// when the JSON-LD product fails to deserialize, or when an in-stock product has no JSON-LD
/// product at all.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let precio_centavos = common::price_from_meta(dom)?;
    let in_stock = vtex::in_stock_from_meta(dom)?;

    let ean = {
        let json = &vtex::parse_script_json(dom, "__STATE__")?;
        let state = json
            .as_object()
            .ok_or_else(|| SimpleError::new("Seed state not an object"))?;
        if state.is_empty() {
            anyhow::bail!("Seed state is an empty object")
        }

        // First locate the entry typed `Product` to learn its cache id...
        let (_, product_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
            })
            .ok_or_else(|| SimpleError::new("No product in seed state"))?;
        let cache_id = product_json
            .get("cacheId")
            .and_then(|v| v.as_str())
            .ok_or_else(|| SimpleError::new("No cacheId in seed state"))?;

        // ...then the `SKU` entry stored under a key that starts with that product's key.
        // The prefix is built once instead of once per map entry.
        let sku_key_prefix = format!("Product:{}", cache_id);
        let (_, product_sku_json) = state
            .iter()
            .find(|(key, val)| {
                key.starts_with(&sku_key_prefix)
                    && val.get("__typename").is_some_and(|t| t == "SKU")
            })
            .ok_or_else(|| SimpleError::new("No Product:cacheId* found"))?;

        product_sku_json
            .get("ean")
            .and_then(|v| v.as_str())
            .ok_or_else(|| SimpleError::new("No product SKU in seed state"))?
            .to_string()
    };

    let (name, image_url) = match find_product_ld(dom) {
        Some(pm) => {
            let p = pm?;
            (Some(p.name), Some(p.image))
        }
        // A missing JSON-LD product is only tolerated for out-of-stock products.
        None if in_stock => anyhow::bail!("No JSONLD product in in stock product"),
        None => (None, None),
    };

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(in_stock),
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}

View file

@ -0,0 +1,19 @@
use std::borrow::Cow;
use tl::VDom;
/// Returns the `content` attribute of the first `<meta property="{prop}">` element in `dom`.
///
/// Yields `None` when the selector cannot be built, no element matches, the matched node is
/// not a tag, or the tag has no `content` value.
pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
    let selector = format!("meta[property=\"{}\"]", prop);
    let handle = dom.query_selector(&selector)?.next()?;
    let tag = handle.get(dom.parser())?.as_tag()?;
    let content = tag.attributes().get("content").flatten()?;
    Some(content.as_utf8_str())
}
/// Reads the `product:price:amount` meta tag and converts it to whole centavos.
///
/// Returns `Ok(None)` when the meta tag (or its `content`) is absent.
///
/// # Errors
/// Returns an error when the content cannot be parsed as an `f64`.
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
    let precio_centavos = get_meta_content(dom, "product:price:amount")
        // Round before casting: `as u64` truncates, and e.g. 19.99 * 100.0 evaluates to
        // 1998.9999999999998 in f64, which would be stored as 1998 instead of 1999.
        .map(|s| s.parse::<f64>().map(|f| (f * 100.0).round() as u64))
        .transpose()?;
    Ok(precio_centavos)
}

View file

@ -0,0 +1,77 @@
use anyhow::Context;
use crate::PrecioPoint;
/// Builds a [`PrecioPoint`] from the DOM of a Coto Digital product page.
///
/// - EAN: second `span.span_codigoplu` inside the `div#brandText` whose text contains
///   `"| EAN: "`.
/// - Price: text of the first `.atg_store_newPrice`, with `$` and thousands dots removed and
///   the decimal comma turned into a dot; `None` when the element is absent or empty.
/// - Stock: in stock unless the page contains a `.product_not_available` element.
/// - Name / image: first `h1.product_page` text and first `.zoomImage1` `src`, when present.
///
/// # Errors
/// Fails when the EAN cannot be located or the price text is not a number.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let parser = dom.parser();

    let ean = dom
        .query_selector("div#brandText")
        .unwrap()
        .filter_map(|h| h.get(parser))
        .filter_map(|n| n.as_tag())
        .find(|t| t.inner_text(parser).as_ref().contains("| EAN: "))
        .context("No encuentro eanparent")?
        .query_selector(parser, "span.span_codigoplu")
        .unwrap()
        .filter_map(|h| h.get(parser))
        .filter_map(|n| n.as_tag())
        .nth(1)
        .context("no encuentro el ean")?
        .inner_text(parser)
        .trim()
        .to_string();

    let precio_centavos = dom
        .query_selector(".atg_store_newPrice")
        .unwrap()
        .filter_map(|h| h.get(parser))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(parser))
        .filter(|s| !s.is_empty())
        .map(|s| {
            // "$1.234,56" -> "1234.56"
            let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
            let s = s.trim();
            s.parse::<f64>()
        })
        .transpose()
        .context("Parseando precio")?
        // Round before casting: `as u64` truncates, so 19.99 would otherwise become 1998.
        .map(|f| (f * 100.0).round() as u64);

    // The marker element signals an unavailable product, so its *absence* means in stock.
    let in_stock = Some(
        dom.query_selector(".product_not_available")
            .unwrap()
            .filter_map(|h| h.get(parser))
            .filter_map(|n| n.as_tag())
            .next()
            .is_none(),
    );

    let name = dom
        .query_selector("h1.product_page")
        .unwrap()
        .filter_map(|h| h.get(parser))
        .filter_map(|n| n.as_tag())
        .next()
        .map(|t| t.inner_text(parser))
        .map(|s| s.trim().to_string());

    let image_url = dom
        .query_selector(".zoomImage1")
        .unwrap()
        .filter_map(|h| h.get(parser))
        .filter_map(|n| n.as_tag())
        .next()
        .and_then(|t| t.attributes().get("src").flatten())
        .map(|s| s.as_utf8_str().to_string());

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock,
        name,
        image_url,
        parser_version: 5,
        precio_centavos,
        url,
    })
}

View file

@ -0,0 +1,41 @@
use anyhow::Context;
use simple_error::bail;
use crate::sites::common;
use crate::PrecioPoint;
use super::vtex::find_product_ld;
use super::vtex::AvailabilityLd;
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let ean = common::get_meta_content(dom, "product:retailer_item_id")
.context("Parsing EAN")?
.to_string();
let precio_centavos = common::price_from_meta(dom)?;
let (name, image_url, in_stock) = match find_product_ld(dom) {
Some(pm) => {
let p = pm?;
(
Some(p.name),
Some(p.image),
Some(
p.offers.offers.first().context("No offer")?.availability
== AvailabilityLd::InStock,
),
)
}
None => bail!("No JSON/LD"),
};
Ok(PrecioPoint {
ean,
fetched_at: crate::now_sec(),
in_stock,
name,
image_url,
parser_version: 5,
precio_centavos,
url,
})
}

View file

@ -0,0 +1,92 @@
use std::str::FromStr;
use anyhow::Context;
use reqwest::Url;
use serde::Deserialize;
use simple_error::bail;
use crate::sites::common;
use crate::PrecioPoint;
use super::vtex;
/// One element of the JSON array returned by the Jumbo product search endpoint.
/// Only the `items` field is deserialized.
#[derive(Deserialize)]
struct JumboSearch {
/// Items listed under this search result; each one carries an EAN.
items: Vec<JumboSearchItem>,
}
/// One entry of `JumboSearch::items`. Only the `ean` field is deserialized.
#[derive(Deserialize)]
struct JumboSearchItem {
/// EAN code reported by the search endpoint for this item.
ean: String,
}
/// Looks up the EAN of a Jumbo product by querying the catalog search API with
/// `fq=skuId:<retailer_sku>`.
///
/// The EAN is taken from the first item of the first search result; every item of that
/// result must carry the same EAN.
///
/// # Errors
/// Fails when the request fails or returns an HTTP error status, when the body is not the
/// expected JSON, when there is no result/item, or when the items disagree on the EAN.
async fn get_ean_from_search(
    client: &reqwest::Client,
    retailer_sku: String,
) -> anyhow::Result<String> {
    let mut search_url =
        Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search")
            .expect("hard-coded search URL is valid");
    search_url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));

    let s = client
        .get(search_url)
        .send()
        .await?
        // Report 4xx/5xx responses as HTTP errors instead of letting an error page reach the
        // JSON parser and surface as a confusing deserialization failure.
        .error_for_status()?
        .text()
        .await?;

    let search: Vec<JumboSearch> = serde_json::from_str(&s)?;
    let result = search.first().context("No search result")?;
    let first_item = result.items.first().context("No search result")?;
    if !result.items.iter().all(|i| i.ean == first_item.ean) {
        anyhow::bail!("Inesperado: no todos los items tienen el mismo EAN")
    }
    Ok(first_item.ean.clone())
}
pub async fn scrap(
client: &reqwest::Client,
url: String,
body: &str,
) -> Result<PrecioPoint, anyhow::Error> {
let (name, image_url, sku, precio_centavos, in_stock) = {
let dom = tl::parse(body, tl::ParserOptions::default())?;
let precio_centavos = common::price_from_meta(&dom)?;
let in_stock = vtex::in_stock_from_meta(&dom)?;
match vtex::find_product_ld(&dom) {
Some(pm) => {
let p = pm?;
(
Some(p.name),
Some(p.image),
p.sku.context("No retailer SKU in Product LD")?,
precio_centavos,
in_stock,
)
}
None => bail!("No JSON/LD"),
}
};
let ean = get_ean_from_search(client, sku).await?;
Ok(PrecioPoint {
ean,
fetched_at: crate::now_sec(),
in_stock: Some(in_stock),
name,
image_url,
parser_version: 5,
precio_centavos,
url,
})
}

View file

@ -0,0 +1,6 @@
pub mod carrefour;
mod common;
pub mod coto;
pub mod dia;
pub mod jumbo;
mod vtex;

View file

@ -0,0 +1,102 @@
use anyhow::{bail, Context};
use serde::Deserialize;
use simple_error::SimpleError;
use tl::VDom;
use super::common;
/// Parses the JSON held in the `<script>` inside the
/// `<template data-type="json" data-varname="{varname}">` element of `dom`.
///
/// # Errors
/// - "Failed to get template tag": no JSON template has the requested `data-varname`.
/// - "Failed to get script tag": the template contains no `script` element.
/// - "Couldn't parse JSON in script": the script's inner HTML is not valid JSON.
pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
    let parser = dom.parser();

    let template = dom
        .query_selector("template[data-type=\"json\"]")
        .unwrap()
        .filter_map(|h| h.get(parser).and_then(|n| n.as_tag()))
        .find(|t| {
            t.attributes()
                .get("data-varname")
                .flatten()
                .is_some_and(|v| v.as_utf8_str() == varname)
        })
        .ok_or_else(|| SimpleError::new("Failed to get template tag"))?;

    let script = template
        .query_selector(parser, "script")
        .and_then(|mut it| it.next())
        .and_then(|h| h.get(parser))
        .ok_or_else(|| SimpleError::new("Failed to get script tag"))?;

    let inner_html = script.inner_html(parser);
    inner_html.parse().context("Couldn't parse JSON in script")
}
/// Iterates over every `<script type="application/ld+json">` tag in `dom`, yielding the
/// result of parsing each tag's inner HTML as JSON.
///
/// Parse failures are yielded as `Err` items rather than skipped; callers decide what to do
/// with them. Panics if the selector cannot be built (`unwrap` on `query_selector`).
pub fn get_json_lds<'a>(
dom: &'a VDom,
) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
dom.query_selector("script[type=\"application/ld+json\"]")
.unwrap()
// Keep only handles that resolve to tag nodes.
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.map(|t| serde_json::from_str(&t.inner_html(dom.parser())))
}
/// Finds the first JSON-LD block in `dom` whose `@type` equals `typ` and deserializes it
/// into an [`Ld`].
///
/// Blocks that are not valid JSON are skipped. Returns `None` when no block matches, and
/// `Some(Err(_))` when the matching block does not fit the `Ld` shape.
pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
    for candidate in get_json_lds(dom) {
        let Ok(value) = candidate else { continue };
        if value.get("@type").is_some_and(|t| t == typ) {
            return Some(serde_json::from_value(value));
        }
    }
    None
}
/// Finds the JSON-LD block of type `Product` in `dom` and returns its payload.
///
/// `None` means the page has no such block; `Some(Err(_))` means it failed to deserialize.
pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
    let ld = find_json_ld(dom, "Product")?;
    Some(ld.map(|Ld::Product(product)| product))
}
/// A JSON-LD document, discriminated by its `@type` field.
/// Only the `Product` type is modelled.
#[derive(Deserialize)]
#[serde(tag = "@type")]
pub enum Ld {
/// Document with `"@type": "Product"`.
Product(ProductLd),
}
/// Fields read from a JSON-LD `Product`; keys are matched in camelCase.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ProductLd {
/// Product name.
pub name: String,
/// Product image, deserialized as a single string.
pub image: String,
/// Retailer SKU; optional in the document.
pub sku: Option<String>,
/// Wrapper object holding the list of offers.
pub offers: OffersLd,
}
/// The `offers` object of a `ProductLd`, which itself contains an `offers` array.
#[derive(Deserialize)]
pub struct OffersLd {
/// Individual offers, in document order.
pub offers: Vec<OfferLd>,
}
/// A single JSON-LD offer; keys are matched in camelCase.
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct OfferLd {
// Never read; its presence makes deserialization fail unless `@type` is "Offer".
#[serde(rename = "@type")]
_type: OfferTypeLd,
/// Offer price as a number.
pub price: f64,
/// Value of the `priceCurrency` key.
pub price_currency: String,
/// Stock status of the offer.
pub availability: AvailabilityLd,
}
/// Accepted values for an offer's `@type`; only `"Offer"` deserializes successfully.
#[derive(Deserialize)]
pub enum OfferTypeLd {
Offer,
}
/// Offer availability, deserialized from the schema.org URL strings below.
/// Any other string makes deserialization fail.
#[derive(Deserialize, PartialEq)]
pub enum AvailabilityLd {
#[serde(rename = "http://schema.org/InStock")]
InStock,
#[serde(rename = "http://schema.org/OutOfStock")]
OutOfStock,
}
/// Reads the `product:availability` meta tag and maps it to a stock flag:
/// `"instock"` is `true`, `"oos"` is `false`.
///
/// # Errors
/// Fails when the meta tag is missing or holds any other value.
pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
    let Some(availability) = common::get_meta_content(dom, "product:availability") else {
        bail!("No product:availability in carrefour")
    };
    match availability.as_ref() {
        "oos" => Ok(false),
        "instock" => Ok(true),
        _ => bail!("Not a valid product:availability"),
    }
}