scraper-rs: dia

This commit is contained in:
Cat /dev/Nulo 2024-01-11 13:05:51 -03:00
parent 27aee01c1a
commit b696551949
6 changed files with 60 additions and 5 deletions

View file

@ -1,6 +1,7 @@
use again::RetryPolicy; use again::RetryPolicy;
use async_channel::{Receiver, Sender}; use async_channel::{Receiver, Sender};
use nanoid::nanoid; use nanoid::nanoid;
use reqwest::Url;
use rusqlite::Connection; use rusqlite::Connection;
use simple_error::{bail, SimpleError}; use simple_error::{bail, SimpleError};
use std::{ use std::{
@ -88,6 +89,7 @@ async fn fetch_and_parse(
client: &reqwest::Client, client: &reqwest::Client,
url: String, url: String,
) -> Result<PrecioPoint, anyhow::Error> { ) -> Result<PrecioPoint, anyhow::Error> {
let url_p = Url::parse(&url).unwrap();
let policy = RetryPolicy::exponential(Duration::from_millis(300)) let policy = RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10) .with_max_retries(10)
.with_jitter(true); .with_jitter(true);
@ -106,7 +108,11 @@ async fn fetch_and_parse(
let maybe_point = { let maybe_point = {
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?; let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
sites::carrefour::parse(url, &dom) match url_p.host_str().unwrap() {
"www.carrefour.com.ar" => sites::carrefour::parse(url, &dom),
"diaonline.supermercadosdia.com.ar" => sites::dia::parse(url, &dom),
s => bail!("Unknown host {}", s),
}
}; };
let point = match maybe_point { let point = match maybe_point {

View file

@ -8,9 +8,7 @@ use crate::PrecioPoint;
use super::vtex::find_product_ld; use super::vtex::find_product_ld;
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> { pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let precio_centavos = common::get_meta_content(dom, "product:price:amount") let precio_centavos = common::price_from_meta(dom)?;
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
.transpose()?;
let in_stock = match common::get_meta_content(dom, "product:availability") { let in_stock = match common::get_meta_content(dom, "product:availability") {
Some(s) => match s.as_ref() { Some(s) => match s.as_ref() {

View file

@ -10,3 +10,10 @@ pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str
.and_then(|tag| tag.attributes().get("content").flatten()) .and_then(|tag| tag.attributes().get("content").flatten())
.map(|s| s.as_utf8_str()) .map(|s| s.as_utf8_str())
} }
/// Reads the `product:price:amount` meta tag and converts it to whole centavos.
///
/// Returns `Ok(None)` when [`get_meta_content`] yields nothing for that property.
///
/// # Errors
///
/// Fails when the tag content does not parse as an `f64`, or when the parsed
/// amount is negative or not finite.
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
    let raw = match get_meta_content(dom, "product:price:amount") {
        Some(raw) => raw,
        None => return Ok(None),
    };
    let amount: f64 = raw.parse()?;
    // A float-to-int `as` cast saturates: negatives and NaN would silently become 0
    // and infinity would become `u64::MAX`, so reject those instead of storing them.
    anyhow::ensure!(
        amount.is_finite() && amount >= 0.0,
        "Invalid price {:?}",
        raw
    );
    // Round before casting: `as u64` truncates toward zero, and decimal prices are
    // rarely exact in binary (19.99 * 100.0 == 1998.9999999999998), which would
    // otherwise lose one centavo.
    Ok(Some((amount * 100.0).round() as u64))
}

View file

@ -0,0 +1,43 @@
use anyhow::Context;
use simple_error::bail;
use simple_error::SimpleError;
use crate::sites::common;
use crate::sites::vtex;
use crate::PrecioPoint;
use super::vtex::find_product_ld;
use super::vtex::AvailabilityLd;
/// Builds a [`PrecioPoint`] from a product page DOM.
///
/// The EAN is read from the `product:retailer_item_id` meta tag and the price via
/// `common::price_from_meta`. Name, image and stock status come from the product
/// JSON-LD returned by `find_product_ld`; stock status is taken from the first offer.
///
/// # Errors
///
/// Fails when the EAN meta tag is absent, the price cannot be read, no JSON-LD is
/// found (or it is itself an error), or the JSON-LD contains no offer.
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
    let ean = common::get_meta_content(dom, "product:retailer_item_id")
        .context("Parsing EAN")?
        .to_string();
    let precio_centavos = common::price_from_meta(dom)?;

    let product = match find_product_ld(dom) {
        None => bail!("No JSON/LD"),
        Some(found) => found?,
    };
    // Only the first offer decides availability.
    let first_offer = product.offers.offers.first().context("No offer")?;
    let available = first_offer.availability == AvailabilityLd::InStock;

    Ok(PrecioPoint {
        ean,
        fetched_at: crate::now_sec(),
        in_stock: Some(available),
        name: Some(product.name),
        image_url: Some(product.image),
        parser_version: 5,
        precio_centavos,
        url,
    })
}

View file

@ -1,3 +1,4 @@
pub mod carrefour; pub mod carrefour;
mod common; mod common;
pub mod dia;
mod vtex; mod vtex;

View file

@ -79,7 +79,7 @@ pub struct OfferLd {
pub enum OfferTypeLd { pub enum OfferTypeLd {
Offer, Offer,
} }
#[derive(Deserialize)] #[derive(Deserialize, PartialEq)]
pub enum AvailabilityLd { pub enum AvailabilityLd {
#[serde(rename = "http://schema.org/InStock")] #[serde(rename = "http://schema.org/InStock")]
InStock, InStock,