mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-25 19:16:19 +00:00
parse file y init coto (WIP)
This commit is contained in:
parent
3a31586193
commit
f2401aa965
3 changed files with 119 additions and 8 deletions
|
@ -12,26 +12,33 @@ use std::{
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
use tl::VDom;
|
||||||
|
|
||||||
#[derive(Parser)] // requires `derive` feature
|
#[derive(Parser)] // requires `derive` feature
|
||||||
enum Args {
|
enum Args {
|
||||||
FetchList(FetchListArgs),
|
FetchList(FetchListArgs),
|
||||||
|
ParseFile(ParseFileArgs),
|
||||||
}
|
}
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
struct FetchListArgs {
|
struct FetchListArgs {
|
||||||
list_path: String,
|
list_path: String,
|
||||||
}
|
}
|
||||||
|
#[derive(clap::Args)]
|
||||||
|
struct ParseFileArgs {
|
||||||
|
file_path: String,
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
tracing_subscriber::fmt::init();
|
tracing_subscriber::fmt::init();
|
||||||
|
|
||||||
match Args::parse() {
|
match Args::parse() {
|
||||||
Args::FetchList(a) => fetch_list(a.list_path).await,
|
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||||
|
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_list(links_list_path: String) -> anyhow::Result<()> {
|
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||||
let links = links_str
|
let links = links_str
|
||||||
.split('\n')
|
.split('\n')
|
||||||
|
@ -103,7 +110,6 @@ async fn fetch_and_parse(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
url: String,
|
url: String,
|
||||||
) -> Result<PrecioPoint, anyhow::Error> {
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let url_p = Url::parse(&url).unwrap();
|
|
||||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
.with_max_retries(10)
|
.with_max_retries(10)
|
||||||
.with_jitter(true);
|
.with_jitter(true);
|
||||||
|
@ -122,11 +128,7 @@ async fn fetch_and_parse(
|
||||||
|
|
||||||
let maybe_point = {
|
let maybe_point = {
|
||||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
||||||
match url_p.host_str().unwrap() {
|
parse_url(url, &dom)
|
||||||
"www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
|
|
||||||
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
|
|
||||||
s => bail!("Unknown host {}", s),
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let point = match maybe_point {
|
let point = match maybe_point {
|
||||||
|
@ -144,6 +146,36 @@ async fn fetch_and_parse(
|
||||||
Ok(point)
|
Ok(point)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||||
|
let file = tokio::fs::read_to_string(file_path).await?;
|
||||||
|
let dom = tl::parse(&file, tl::ParserOptions::default())?;
|
||||||
|
|
||||||
|
let url = dom
|
||||||
|
.query_selector("link[rel=\"canonical\"]")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.and_then(|t| t.attributes().get("href").flatten())
|
||||||
|
.expect("No meta canonical")
|
||||||
|
.as_utf8_str()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
println!("URL: {}", &url);
|
||||||
|
println!("{:?}", parse_url(url, &dom));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_url(url: String, dom: &VDom) -> anyhow::Result<PrecioPoint> {
|
||||||
|
let url_p = Url::parse(&url).unwrap();
|
||||||
|
match url_p.host_str().unwrap() {
|
||||||
|
"www.carrefour.com.ar" => sites::carrefour::parse(url, dom),
|
||||||
|
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, dom),
|
||||||
|
"www.cotodigital3.com.ar" => sites::coto::parse(url, dom),
|
||||||
|
s => bail!("Unknown host {}", s),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||||
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
||||||
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
||||||
|
|
78
scraper-rs/src/sites/coto.rs
Normal file
78
scraper-rs/src/sites/coto.rs
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
#[tracing::instrument(skip(dom))]
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let ean = dom
|
||||||
|
.query_selector("div#brandText")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
|
||||||
|
.context("No encuentro eanparent")?
|
||||||
|
.query_selector(dom.parser(), "span.span_codigoplu")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.nth(1)
|
||||||
|
.context("no encuentro el ean")?
|
||||||
|
.inner_text(dom.parser())
|
||||||
|
.trim()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let precio_centavos = dom
|
||||||
|
.query_selector(".atg_store_newPrice")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.map(|s| {
|
||||||
|
let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
|
||||||
|
let s = s.trim();
|
||||||
|
s.parse::<f64>()
|
||||||
|
})
|
||||||
|
.transpose()
|
||||||
|
.context("Parseando precio")?
|
||||||
|
.map(|f| (f * 100.0) as u64);
|
||||||
|
|
||||||
|
let in_stock = Some(
|
||||||
|
dom.query_selector(".product_not_available")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.is_some(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let name = dom
|
||||||
|
.query_selector("h1.product_page")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
|
.map(|s| s.trim().to_string());
|
||||||
|
|
||||||
|
let image_url = dom
|
||||||
|
.query_selector(".zoom img")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.and_then(|t| t.attributes().get("src").flatten())
|
||||||
|
.map(|s| s.as_utf8_str().to_string());
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock,
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
|
@ -1,4 +1,5 @@
|
||||||
pub mod carrefour;
|
pub mod carrefour;
|
||||||
mod common;
|
mod common;
|
||||||
|
pub mod coto;
|
||||||
pub mod dia;
|
pub mod dia;
|
||||||
mod vtex;
|
mod vtex;
|
||||||
|
|
Loading…
Reference in a new issue