mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
Merge branch 'farmacity'
This commit is contained in:
commit
9d9249e2b3
9 changed files with 111 additions and 24 deletions
|
@ -3,25 +3,31 @@ export enum Supermercado {
|
||||||
Carrefour = "Carrefour",
|
Carrefour = "Carrefour",
|
||||||
Coto = "Coto",
|
Coto = "Coto",
|
||||||
Jumbo = "Jumbo",
|
Jumbo = "Jumbo",
|
||||||
|
Farmacity = "Farmacity",
|
||||||
}
|
}
|
||||||
export const supermercados: Supermercado[] = [
|
export const supermercados: Supermercado[] = [
|
||||||
Supermercado.Carrefour,
|
Supermercado.Carrefour,
|
||||||
Supermercado.Coto,
|
Supermercado.Coto,
|
||||||
Supermercado.Dia,
|
Supermercado.Dia,
|
||||||
Supermercado.Jumbo,
|
Supermercado.Jumbo,
|
||||||
|
Supermercado.Farmacity,
|
||||||
];
|
];
|
||||||
export const hosts: { [host: string]: Supermercado } = {
|
export const hosts: { [host: string]: Supermercado } = {
|
||||||
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
||||||
"www.carrefour.com.ar": Supermercado.Carrefour,
|
"www.carrefour.com.ar": Supermercado.Carrefour,
|
||||||
"www.cotodigital3.com.ar": Supermercado.Coto,
|
"www.cotodigital3.com.ar": Supermercado.Coto,
|
||||||
"www.jumbo.com.ar": Supermercado.Jumbo,
|
"www.jumbo.com.ar": Supermercado.Jumbo,
|
||||||
|
"www.farmacity.com": Supermercado.Farmacity,
|
||||||
};
|
};
|
||||||
export const hostBySupermercado = Object.fromEntries(
|
export const hostBySupermercado = Object.fromEntries(
|
||||||
Object.entries(hosts).map(([a, b]) => [b, a])
|
Object.entries(hosts).map(([a, b]) => [b, a])
|
||||||
) as Record<Supermercado, string>;
|
) as Record<Supermercado, string>;
|
||||||
|
|
||||||
|
// también actualizar en sitio/src/routes/ean/[ean]/+page.svelte
|
||||||
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||||
[Supermercado.Dia]: "#d52b1e",
|
[Supermercado.Dia]: "#d52b1e",
|
||||||
[Supermercado.Carrefour]: "#19549d",
|
[Supermercado.Carrefour]: "#19549d",
|
||||||
[Supermercado.Coto]: "#e20025",
|
[Supermercado.Coto]: "#e20025",
|
||||||
[Supermercado.Jumbo]: "#2dc850",
|
[Supermercado.Jumbo]: "#2dc850",
|
||||||
|
[Supermercado.Farmacity]: "#EF7603",
|
||||||
};
|
};
|
||||||
|
|
|
@ -186,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
|
||||||
res.text().await
|
res.text().await
|
||||||
}
|
}
|
||||||
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
|
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
|
||||||
get_retry_policy()
|
get_fetch_retry_policy()
|
||||||
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
|
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_retry_policy() -> again::RetryPolicy {
|
pub fn get_fetch_retry_policy() -> again::RetryPolicy {
|
||||||
RetryPolicy::exponential(Duration::from_millis(300))
|
RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
.with_max_retries(20)
|
.with_max_retries(20)
|
||||||
.with_max_delay(Duration::from_secs(40))
|
.with_max_delay(Duration::from_secs(40))
|
||||||
.with_jitter(true)
|
.with_jitter(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_parse_retry_policy() -> again::RetryPolicy {
|
||||||
|
RetryPolicy::exponential(Duration::from_millis(1500))
|
||||||
|
.with_max_retries(5)
|
||||||
|
.with_max_delay(Duration::from_secs(5))
|
||||||
|
.with_jitter(true)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
||||||
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
|
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
|
||||||
}
|
}
|
||||||
|
@ -206,9 +213,12 @@ pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
||||||
async fn fetch_and_parse(
|
async fn fetch_and_parse(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
url: String,
|
url: String,
|
||||||
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
async fn fetch_and_scrap(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
url: String,
|
||||||
) -> Result<PrecioPoint, anyhow::Error> {
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let body = fetch_body(client, &url).await?;
|
let body = fetch_body(client, &url).await?;
|
||||||
|
|
||||||
let maybe_point = { scrap_url(client, url, &body).await };
|
let maybe_point = { scrap_url(client, url, &body).await };
|
||||||
|
|
||||||
let point = match maybe_point {
|
let point = match maybe_point {
|
||||||
|
@ -227,6 +237,17 @@ async fn fetch_and_parse(
|
||||||
Ok(point)
|
Ok(point)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
get_parse_retry_policy()
|
||||||
|
.retry_if(
|
||||||
|
|| fetch_and_scrap(client, url.clone()),
|
||||||
|
|err: &anyhow::Error| match err.downcast_ref::<reqwest::Error>() {
|
||||||
|
Some(e) => !e.status().is_some_and(|s| s == StatusCode::NOT_FOUND),
|
||||||
|
None => true,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||||
let file = tokio::fs::read_to_string(file_path).await?;
|
let file = tokio::fs::read_to_string(file_path).await?;
|
||||||
|
|
||||||
|
@ -264,6 +285,7 @@ async fn get_urls(supermercado: &Supermercado) -> Result<Vec<String>, anyhow::Er
|
||||||
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
||||||
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
||||||
Supermercado::Coto => sites::coto::get_urls().await?,
|
Supermercado::Coto => sites::coto::get_urls().await?,
|
||||||
|
Supermercado::Farmacity => sites::farmacity::get_urls().await?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -284,6 +306,9 @@ async fn scrap_url(
|
||||||
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||||
}
|
}
|
||||||
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
||||||
|
"www.farmacity.com" => {
|
||||||
|
sites::farmacity::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||||
|
}
|
||||||
s => bail!("Unknown host {}", s),
|
s => bail!("Unknown host {}", s),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
|
||||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint};
|
use crate::{
|
||||||
|
build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
|
||||||
|
};
|
||||||
|
|
||||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let ean = dom
|
let ean = dom
|
||||||
|
@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||||
.finish();
|
.finish();
|
||||||
let client = &client;
|
let client = &client;
|
||||||
async move {
|
async move {
|
||||||
let text = get_retry_policy()
|
let text = get_fetch_retry_policy()
|
||||||
.retry_if(
|
.retry_if(
|
||||||
|| do_request(client, u.as_str()).and_then(|r| r.text()),
|
|| do_request(client, u.as_str()).and_then(|r| r.text()),
|
||||||
retry_if_wasnt_not_found,
|
retry_if_wasnt_not_found,
|
||||||
|
|
50
scraper-rs/src/sites/farmacity.rs
Normal file
50
scraper-rs/src/sites/farmacity.rs
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
use simple_error::bail;
|
||||||
|
|
||||||
|
use crate::sites::common;
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
use super::vtex;
|
||||||
|
use super::vtex::find_product_ld;
|
||||||
|
use super::vtex::AvailabilityLd;
|
||||||
|
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let ean = common::get_meta_content(dom, "product:retailer_item_id")
|
||||||
|
.context("Parsing EAN")?
|
||||||
|
.to_string();
|
||||||
|
let precio_centavos = common::price_from_meta(dom)?;
|
||||||
|
|
||||||
|
let (name, image_url, in_stock) = match find_product_ld(dom) {
|
||||||
|
Some(pm) => {
|
||||||
|
let p = pm?;
|
||||||
|
(
|
||||||
|
Some(p.name),
|
||||||
|
Some(p.image),
|
||||||
|
Some(
|
||||||
|
p.offers.offers.first().context("No offer")?.availability
|
||||||
|
== AvailabilityLd::InStock,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
None => bail!("No JSON/LD"),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock,
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||||
|
let urls = vec![
|
||||||
|
"https://www.farmacity.com/sitemap/product-0.xml",
|
||||||
|
"https://www.farmacity.com/sitemap/product-1.xml",
|
||||||
|
];
|
||||||
|
vtex::get_urls_from_sitemap(urls).await
|
||||||
|
}
|
|
@ -7,7 +7,7 @@ use serde::Deserialize;
|
||||||
use simple_error::bail;
|
use simple_error::bail;
|
||||||
|
|
||||||
use crate::sites::common;
|
use crate::sites::common;
|
||||||
use crate::{do_request, get_retry_policy, PrecioPoint};
|
use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
|
||||||
|
|
||||||
use super::vtex;
|
use super::vtex;
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ async fn get_ean_from_search(
|
||||||
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
||||||
url
|
url
|
||||||
};
|
};
|
||||||
let s = get_retry_policy()
|
let s = get_fetch_retry_policy()
|
||||||
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
|
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
|
||||||
.await?;
|
.await?;
|
||||||
let ean = {
|
let ean = {
|
||||||
|
|
|
@ -2,5 +2,6 @@ pub mod carrefour;
|
||||||
mod common;
|
mod common;
|
||||||
pub mod coto;
|
pub mod coto;
|
||||||
pub mod dia;
|
pub mod dia;
|
||||||
|
pub mod farmacity;
|
||||||
pub mod jumbo;
|
pub mod jumbo;
|
||||||
pub mod vtex;
|
pub mod vtex;
|
||||||
|
|
|
@ -10,7 +10,7 @@ use serde_json::json;
|
||||||
use simple_error::SimpleError;
|
use simple_error::SimpleError;
|
||||||
use tl::VDom;
|
use tl::VDom;
|
||||||
|
|
||||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found};
|
use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
|
||||||
|
|
||||||
use super::common;
|
use super::common;
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
let url = url.to_string();
|
let url = url.to_string();
|
||||||
async move {
|
async move {
|
||||||
let client = client;
|
let client = client;
|
||||||
let text = get_retry_policy()
|
let text = get_fetch_retry_policy()
|
||||||
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
|
@ -152,7 +152,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
|
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
|
||||||
let body = get_retry_policy()
|
let body = get_fetch_retry_policy()
|
||||||
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
|
|
|
@ -6,6 +6,7 @@ pub enum Supermercado {
|
||||||
Jumbo,
|
Jumbo,
|
||||||
Carrefour,
|
Carrefour,
|
||||||
Coto,
|
Coto,
|
||||||
|
Farmacity,
|
||||||
}
|
}
|
||||||
impl Supermercado {
|
impl Supermercado {
|
||||||
pub fn host(&self) -> &'static str {
|
pub fn host(&self) -> &'static str {
|
||||||
|
@ -14,6 +15,7 @@ impl Supermercado {
|
||||||
Self::Carrefour => "www.carrefour.com.ar",
|
Self::Carrefour => "www.carrefour.com.ar",
|
||||||
Self::Coto => "www.cotodigital3.com.ar",
|
Self::Coto => "www.cotodigital3.com.ar",
|
||||||
Self::Jumbo => "www.jumbo.com.ar",
|
Self::Jumbo => "www.jumbo.com.ar",
|
||||||
|
Self::Farmacity => "www.farmacity.com",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
||||||
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
||||||
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
|
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
|
||||||
|
[Supermercado.Farmacity]: "bg-[#EF7603] focus:ring-[#EF7603]",
|
||||||
};
|
};
|
||||||
|
|
||||||
const formatter = new Intl.NumberFormat("es-AR", {
|
const formatter = new Intl.NumberFormat("es-AR", {
|
||||||
|
|
Loading…
Reference in a new issue