mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-23 06:36:19 +00:00
scraper-rs: dia
This commit is contained in:
parent
27aee01c1a
commit
b696551949
6 changed files with 60 additions and 5 deletions
|
@ -1,6 +1,7 @@
|
||||||
use again::RetryPolicy;
|
use again::RetryPolicy;
|
||||||
use async_channel::{Receiver, Sender};
|
use async_channel::{Receiver, Sender};
|
||||||
use nanoid::nanoid;
|
use nanoid::nanoid;
|
||||||
|
use reqwest::Url;
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
use simple_error::{bail, SimpleError};
|
use simple_error::{bail, SimpleError};
|
||||||
use std::{
|
use std::{
|
||||||
|
@ -88,6 +89,7 @@ async fn fetch_and_parse(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
url: String,
|
url: String,
|
||||||
) -> Result<PrecioPoint, anyhow::Error> {
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let url_p = Url::parse(&url).unwrap();
|
||||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
.with_max_retries(10)
|
.with_max_retries(10)
|
||||||
.with_jitter(true);
|
.with_jitter(true);
|
||||||
|
@ -106,7 +108,11 @@ async fn fetch_and_parse(
|
||||||
|
|
||||||
let maybe_point = {
|
let maybe_point = {
|
||||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
||||||
sites::carrefour::parse(url, &dom)
|
match url_p.host_str().unwrap() {
|
||||||
|
"www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
|
||||||
|
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
|
||||||
|
s => bail!("Unknown host {}", s),
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let point = match maybe_point {
|
let point = match maybe_point {
|
||||||
|
|
|
@ -8,9 +8,7 @@ use crate::PrecioPoint;
|
||||||
use super::vtex::find_product_ld;
|
use super::vtex::find_product_ld;
|
||||||
|
|
||||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let precio_centavos = common::get_meta_content(dom, "product:price:amount")
|
let precio_centavos = common::price_from_meta(dom)?;
|
||||||
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
|
|
||||||
.transpose()?;
|
|
||||||
|
|
||||||
let in_stock = match common::get_meta_content(dom, "product:availability") {
|
let in_stock = match common::get_meta_content(dom, "product:availability") {
|
||||||
Some(s) => match s.as_ref() {
|
Some(s) => match s.as_ref() {
|
||||||
|
|
|
@ -10,3 +10,10 @@ pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str
|
||||||
.and_then(|tag| tag.attributes().get("content").flatten())
|
.and_then(|tag| tag.attributes().get("content").flatten())
|
||||||
.map(|s| s.as_utf8_str())
|
.map(|s| s.as_utf8_str())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
|
||||||
|
let precio_centavos = get_meta_content(dom, "product:price:amount")
|
||||||
|
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
|
||||||
|
.transpose()?;
|
||||||
|
Ok(precio_centavos)
|
||||||
|
}
|
||||||
|
|
43
scraper-rs/src/sites/dia.rs
Normal file
43
scraper-rs/src/sites/dia.rs
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
use simple_error::bail;
|
||||||
|
use simple_error::SimpleError;
|
||||||
|
|
||||||
|
use crate::sites::common;
|
||||||
|
use crate::sites::vtex;
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
use super::vtex::find_product_ld;
|
||||||
|
use super::vtex::AvailabilityLd;
|
||||||
|
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let ean = common::get_meta_content(dom, "product:retailer_item_id")
|
||||||
|
.context("Parsing EAN")?
|
||||||
|
.to_string();
|
||||||
|
let precio_centavos = common::price_from_meta(dom)?;
|
||||||
|
|
||||||
|
let (name, image_url, in_stock) = match find_product_ld(dom) {
|
||||||
|
Some(pm) => {
|
||||||
|
let p = pm?;
|
||||||
|
(
|
||||||
|
Some(p.name),
|
||||||
|
Some(p.image),
|
||||||
|
Some(
|
||||||
|
p.offers.offers.first().context("No offer")?.availability
|
||||||
|
== AvailabilityLd::InStock,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
None => bail!("No JSON/LD"),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock,
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
pub mod carrefour;
|
pub mod carrefour;
|
||||||
mod common;
|
mod common;
|
||||||
|
pub mod dia;
|
||||||
mod vtex;
|
mod vtex;
|
||||||
|
|
|
@ -79,7 +79,7 @@ pub struct OfferLd {
|
||||||
pub enum OfferTypeLd {
|
pub enum OfferTypeLd {
|
||||||
Offer,
|
Offer,
|
||||||
}
|
}
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize, PartialEq)]
|
||||||
pub enum AvailabilityLd {
|
pub enum AvailabilityLd {
|
||||||
#[serde(rename = "http://schema.org/InStock")]
|
#[serde(rename = "http://schema.org/InStock")]
|
||||||
InStock,
|
InStock,
|
||||||
|
|
Loading…
Reference in a new issue