parse file y init coto (WIP)

This commit is contained in:
Cat /dev/Nulo 2024-01-11 14:09:18 -03:00
parent 3a31586193
commit f2401aa965
3 changed files with 119 additions and 8 deletions

View file

@@ -12,26 +12,33 @@ use std::{
time::Duration, time::Duration,
}; };
use thiserror::Error; use thiserror::Error;
use tl::VDom;
#[derive(Parser)] // requires `derive` feature #[derive(Parser)] // requires `derive` feature
enum Args { enum Args {
FetchList(FetchListArgs), FetchList(FetchListArgs),
ParseFile(ParseFileArgs),
} }
#[derive(clap::Args)] #[derive(clap::Args)]
struct FetchListArgs { struct FetchListArgs {
list_path: String, list_path: String,
} }
#[derive(clap::Args)]
struct ParseFileArgs {
file_path: String,
}
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt::init(); tracing_subscriber::fmt::init();
match Args::parse() { match Args::parse() {
Args::FetchList(a) => fetch_list(a.list_path).await, Args::FetchList(a) => fetch_list_cli(a.list_path).await,
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
} }
} }
async fn fetch_list(links_list_path: String) -> anyhow::Result<()> { async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
let links_str = fs::read_to_string(links_list_path).unwrap(); let links_str = fs::read_to_string(links_list_path).unwrap();
let links = links_str let links = links_str
.split('\n') .split('\n')
@@ -103,7 +110,6 @@ async fn fetch_and_parse(
client: &reqwest::Client, client: &reqwest::Client,
url: String, url: String,
) -> Result<PrecioPoint, anyhow::Error> { ) -> Result<PrecioPoint, anyhow::Error> {
let url_p = Url::parse(&url).unwrap();
let policy = RetryPolicy::exponential(Duration::from_millis(300)) let policy = RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10) .with_max_retries(10)
.with_jitter(true); .with_jitter(true);
@@ -122,11 +128,7 @@ async fn fetch_and_parse(
let maybe_point = { let maybe_point = {
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?; let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
match url_p.host_str().unwrap() { parse_url(url, &dom)
"www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
s => bail!("Unknown host {}", s),
}
}; };
let point = match maybe_point { let point = match maybe_point {
@@ -144,6 +146,36 @@ async fn fetch_and_parse(
Ok(point) Ok(point)
} }
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
let file = tokio::fs::read_to_string(file_path).await?;
let dom = tl::parse(&file, tl::ParserOptions::default())?;
let url = dom
.query_selector("link[rel=\"canonical\"]")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.and_then(|t| t.attributes().get("href").flatten())
.expect("No meta canonical")
.as_utf8_str()
.to_string();
println!("URL: {}", &url);
println!("{:?}", parse_url(url, &dom));
Ok(())
}
fn parse_url(url: String, dom: &VDom) -> anyhow::Result<PrecioPoint> {
let url_p = Url::parse(&url).unwrap();
match url_p.host_str().unwrap() {
"www.carrefour.com.ar" => sites::carrefour::parse(url, dom),
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom),
"www.cotodigital3.com.ar" => sites::coto::parse(url, dom),
s => bail!("Unknown host {}", s),
}
}
async fn db_writer(rx: Receiver<PrecioPoint>) { async fn db_writer(rx: Receiver<PrecioPoint>) {
// let conn = Connection::open("../scraper/sqlite.db").unwrap(); // let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?; // let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;

View file

@@ -0,0 +1,78 @@
use anyhow::Context;
use crate::PrecioPoint;
#[tracing::instrument(skip(dom))]
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let ean = dom
.query_selector("div#brandText")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
.context("No encuentro eanparent")?
.query_selector(dom.parser(), "span.span_codigoplu")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.nth(1)
.context("no encuentro el ean")?
.inner_text(dom.parser())
.trim()
.to_string();
let precio_centavos = dom
.query_selector(".atg_store_newPrice")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.map(|t| t.inner_text(dom.parser()))
.filter(|s| !s.is_empty())
.map(|s| {
let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
let s = s.trim();
s.parse::<f64>()
})
.transpose()
.context("Parseando precio")?
.map(|f| (f * 100.0) as u64);
let in_stock = Some(
dom.query_selector(".product_not_available")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.is_some(),
);
let name = dom
.query_selector("h1.product_page")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.map(|t| t.inner_text(dom.parser()))
.map(|s| s.trim().to_string());
let image_url = dom
.query_selector(".zoom img")
.unwrap()
.filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag())
.next()
.and_then(|t| t.attributes().get("src").flatten())
.map(|s| s.as_utf8_str().to_string());
Ok(PrecioPoint {
ean,
fetched_at: crate::now_sec(),
in_stock,
name,
image_url,
parser_version: 5,
precio_centavos,
url,
})
}

View file

@@ -1,4 +1,5 @@
pub mod carrefour; pub mod carrefour;
mod common; mod common;
pub mod coto;
pub mod dia; pub mod dia;
mod vtex; mod vtex;