mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 22:26:19 +00:00
scraper-rs: dia
This commit is contained in:
parent
27aee01c1a
commit
b696551949
6 changed files with 60 additions and 5 deletions
|
@ -1,6 +1,7 @@
|
|||
use again::RetryPolicy;
|
||||
use async_channel::{Receiver, Sender};
|
||||
use nanoid::nanoid;
|
||||
use reqwest::Url;
|
||||
use rusqlite::Connection;
|
||||
use simple_error::{bail, SimpleError};
|
||||
use std::{
|
||||
|
@ -88,6 +89,7 @@ async fn fetch_and_parse(
|
|||
client: &reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let url_p = Url::parse(&url).unwrap();
|
||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(10)
|
||||
.with_jitter(true);
|
||||
|
@ -106,7 +108,11 @@ async fn fetch_and_parse(
|
|||
|
||||
let maybe_point = {
|
||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
||||
sites::carrefour::parse(url, &dom)
|
||||
match url_p.host_str().unwrap() {
|
||||
"www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
|
||||
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
|
||||
s => bail!("Unknown host {}", s),
|
||||
}
|
||||
};
|
||||
|
||||
let point = match maybe_point {
|
||||
|
|
|
@ -8,9 +8,7 @@ use crate::PrecioPoint;
|
|||
use super::vtex::find_product_ld;
|
||||
|
||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let precio_centavos = common::get_meta_content(dom, "product:price:amount")
|
||||
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
|
||||
.transpose()?;
|
||||
let precio_centavos = common::price_from_meta(dom)?;
|
||||
|
||||
let in_stock = match common::get_meta_content(dom, "product:availability") {
|
||||
Some(s) => match s.as_ref() {
|
||||
|
|
|
@ -10,3 +10,10 @@ pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str
|
|||
.and_then(|tag| tag.attributes().get("content").flatten())
|
||||
.map(|s| s.as_utf8_str())
|
||||
}
|
||||
|
||||
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
|
||||
let precio_centavos = get_meta_content(dom, "product:price:amount")
|
||||
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
|
||||
.transpose()?;
|
||||
Ok(precio_centavos)
|
||||
}
|
||||
|
|
43
scraper-rs/src/sites/dia.rs
Normal file
43
scraper-rs/src/sites/dia.rs
Normal file
|
@ -0,0 +1,43 @@
|
|||
use anyhow::Context;
|
||||
use simple_error::bail;
|
||||
use simple_error::SimpleError;
|
||||
|
||||
use crate::sites::common;
|
||||
use crate::sites::vtex;
|
||||
use crate::PrecioPoint;
|
||||
|
||||
use super::vtex::find_product_ld;
|
||||
use super::vtex::AvailabilityLd;
|
||||
|
||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let ean = common::get_meta_content(dom, "product:retailer_item_id")
|
||||
.context("Parsing EAN")?
|
||||
.to_string();
|
||||
let precio_centavos = common::price_from_meta(dom)?;
|
||||
|
||||
let (name, image_url, in_stock) = match find_product_ld(dom) {
|
||||
Some(pm) => {
|
||||
let p = pm?;
|
||||
(
|
||||
Some(p.name),
|
||||
Some(p.image),
|
||||
Some(
|
||||
p.offers.offers.first().context("No offer")?.availability
|
||||
== AvailabilityLd::InStock,
|
||||
),
|
||||
)
|
||||
}
|
||||
None => bail!("No JSON/LD"),
|
||||
};
|
||||
|
||||
Ok(PrecioPoint {
|
||||
ean,
|
||||
fetched_at: crate::now_sec(),
|
||||
in_stock,
|
||||
name,
|
||||
image_url,
|
||||
parser_version: 5,
|
||||
precio_centavos,
|
||||
url,
|
||||
})
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
pub mod carrefour;
|
||||
mod common;
|
||||
pub mod dia;
|
||||
mod vtex;
|
||||
|
|
|
@ -79,7 +79,7 @@ pub struct OfferLd {
|
|||
pub enum OfferTypeLd {
|
||||
Offer,
|
||||
}
|
||||
#[derive(Deserialize)]
|
||||
#[derive(Deserialize, PartialEq)]
|
||||
pub enum AvailabilityLd {
|
||||
#[serde(rename = "http://schema.org/InStock")]
|
||||
InStock,
|
||||
|
|
Loading…
Reference in a new issue