Compare commits

..

9 commits

Author SHA1 Message Date
eca98c616e printear queries 2024-02-06 23:44:16 -03:00
9d9249e2b3 Merge branch 'farmacity' 2024-02-06 23:29:15 -03:00
00fa20f625 mostrar ultimos precios en pagina producto 2024-02-06 23:28:36 -03:00
1118bcf75d bringup farmacity 2024-02-06 23:20:04 -03:00
d1d496514c mejorar timing parseo 2024-02-06 23:17:24 -03:00
c66c325a4f no reintentar junto a parse si es 404 2024-02-06 22:53:46 -03:00
b807de8eb4 reintentar si falla parsear
muchos sitios vtex devuelven un html sin casi nada aleatoriamente
2024-02-06 22:48:44 -03:00
18771cb944 fixup! 2024-02-06 22:10:48 -03:00
d55fc8f603 para testear: solo fetchear un supermercado 2024-02-06 22:04:05 -03:00
10 changed files with 150 additions and 43 deletions

View file

@@ -9,7 +9,7 @@ import { migrateDb } from "./migrate.js";
let db = null; let db = null;
export function getDb() { export function getDb() {
const sqlite = new Database(DB_PATH); const sqlite = new Database(DB_PATH);
db = drizzle(sqlite, { schema }); db = drizzle(sqlite, { schema, logger: true });
migrateDb(db); migrateDb(db);
return db; return db;
} }

View file

@@ -3,25 +3,31 @@ export enum Supermercado {
Carrefour = "Carrefour", Carrefour = "Carrefour",
Coto = "Coto", Coto = "Coto",
Jumbo = "Jumbo", Jumbo = "Jumbo",
Farmacity = "Farmacity",
} }
export const supermercados: Supermercado[] = [ export const supermercados: Supermercado[] = [
Supermercado.Carrefour, Supermercado.Carrefour,
Supermercado.Coto, Supermercado.Coto,
Supermercado.Dia, Supermercado.Dia,
Supermercado.Jumbo, Supermercado.Jumbo,
Supermercado.Farmacity,
]; ];
export const hosts: { [host: string]: Supermercado } = { export const hosts: { [host: string]: Supermercado } = {
"diaonline.supermercadosdia.com.ar": Supermercado.Dia, "diaonline.supermercadosdia.com.ar": Supermercado.Dia,
"www.carrefour.com.ar": Supermercado.Carrefour, "www.carrefour.com.ar": Supermercado.Carrefour,
"www.cotodigital3.com.ar": Supermercado.Coto, "www.cotodigital3.com.ar": Supermercado.Coto,
"www.jumbo.com.ar": Supermercado.Jumbo, "www.jumbo.com.ar": Supermercado.Jumbo,
"www.farmacity.com": Supermercado.Farmacity,
}; };
export const hostBySupermercado = Object.fromEntries( export const hostBySupermercado = Object.fromEntries(
Object.entries(hosts).map(([a, b]) => [b, a]) Object.entries(hosts).map(([a, b]) => [b, a])
) as Record<Supermercado, string>; ) as Record<Supermercado, string>;
// también actualizar en sitio/src/routes/ean/[ean]/+page.svelte
export const colorBySupermercado: { [supermercado in Supermercado]: string } = { export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "#d52b1e", [Supermercado.Dia]: "#d52b1e",
[Supermercado.Carrefour]: "#19549d", [Supermercado.Carrefour]: "#19549d",
[Supermercado.Coto]: "#e20025", [Supermercado.Coto]: "#e20025",
[Supermercado.Jumbo]: "#2dc850", [Supermercado.Jumbo]: "#2dc850",
[Supermercado.Farmacity]: "#EF7603",
}; };

View file

@@ -46,10 +46,12 @@ struct GetUrlListArgs {
struct ScrapUrlArgs { struct ScrapUrlArgs {
url: String, url: String,
} }
#[derive(clap::Args)] #[derive(clap::Args, Clone, Copy)]
struct AutoArgs { struct AutoArgs {
#[arg(long)] #[arg(long)]
n_products: Option<usize>, n_products: Option<usize>,
#[arg(long)]
only_supermercado: Option<Supermercado>,
} }
#[tokio::main] #[tokio::main]
@@ -184,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
res.text().await res.text().await
} }
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> { pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
get_retry_policy() get_fetch_retry_policy()
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found) .retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
.await .await
} }
pub fn get_retry_policy() -> again::RetryPolicy { pub fn get_fetch_retry_policy() -> again::RetryPolicy {
RetryPolicy::exponential(Duration::from_millis(300)) RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(20) .with_max_retries(20)
.with_max_delay(Duration::from_secs(40)) .with_max_delay(Duration::from_secs(40))
.with_jitter(true) .with_jitter(true)
} }
pub fn get_parse_retry_policy() -> again::RetryPolicy {
RetryPolicy::exponential(Duration::from_millis(1500))
.with_max_retries(5)
.with_max_delay(Duration::from_secs(5))
.with_jitter(true)
}
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool { pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND) !err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
} }
@@ -205,24 +214,38 @@ async fn fetch_and_parse(
client: &reqwest::Client, client: &reqwest::Client,
url: String, url: String,
) -> Result<PrecioPoint, anyhow::Error> { ) -> Result<PrecioPoint, anyhow::Error> {
let body = fetch_body(client, &url).await?; async fn fetch_and_scrap(
client: &reqwest::Client,
url: String,
) -> Result<PrecioPoint, anyhow::Error> {
let body = fetch_body(client, &url).await?;
let maybe_point = { scrap_url(client, url, &body).await };
let maybe_point = { scrap_url(client, url, &body).await }; let point = match maybe_point {
Ok(p) => Ok(p),
Err(err) => {
let now: DateTime<Utc> = Utc::now();
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
let point = match maybe_point { Ok(point)
Ok(p) => Ok(p), }
Err(err) => {
let now: DateTime<Utc> = Utc::now();
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
Ok(point) get_parse_retry_policy()
.retry_if(
|| fetch_and_scrap(client, url.clone()),
|err: &anyhow::Error| match err.downcast_ref::<reqwest::Error>() {
Some(e) => !e.status().is_some_and(|s| s == StatusCode::NOT_FOUND),
None => true,
},
)
.await
} }
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> { async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
@@ -262,6 +285,7 @@ async fn get_urls(supermercado: &Supermercado) -> Result<Vec<String>, anyhow::Er
Supermercado::Jumbo => sites::jumbo::get_urls().await?, Supermercado::Jumbo => sites::jumbo::get_urls().await?,
Supermercado::Carrefour => sites::carrefour::get_urls().await?, Supermercado::Carrefour => sites::carrefour::get_urls().await?,
Supermercado::Coto => sites::coto::get_urls().await?, Supermercado::Coto => sites::coto::get_urls().await?,
Supermercado::Farmacity => sites::farmacity::get_urls().await?,
}) })
} }
@@ -282,6 +306,9 @@ async fn scrap_url(
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?) sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
} }
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await, "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
"www.farmacity.com" => {
sites::farmacity::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
}
s => bail!("Unknown host {}", s), s => bail!("Unknown host {}", s),
} }
} }
@@ -296,7 +323,7 @@ struct AutoTelegram {
struct Auto { struct Auto {
db: Db, db: Db,
telegram: Option<AutoTelegram>, telegram: Option<AutoTelegram>,
limit_n_products: Option<usize>, args: AutoArgs,
} }
impl Auto { impl Auto {
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> { async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
@@ -312,7 +339,7 @@ impl Auto {
} }
let links: Vec<String> = { let links: Vec<String> = {
let mut links = self.db.get_urls_by_domain(supermercado.host()).await?; let mut links = self.db.get_urls_by_domain(supermercado.host()).await?;
if let Some(n) = self.limit_n_products { if let Some(n) = self.args.n_products {
links.truncate(n); links.truncate(n);
} }
links links
@@ -386,14 +413,16 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
} }
} }
}; };
Auto { Auto { db, telegram, args }
db,
telegram,
limit_n_products: args.n_products,
}
}; };
auto.inform("[auto] Empezando scrap").await; auto.inform("[auto] Empezando scrap").await;
let handles: Vec<_> = Supermercado::value_variants()
let supermercados = match args.only_supermercado {
Some(supermercado) => [supermercado].to_vec(),
None => Supermercado::value_variants().to_vec(),
};
let handles: Vec<_> = supermercados
.iter() .iter()
.map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned()))) .map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned())))
.collect(); .collect();
@@ -425,7 +454,12 @@ async fn cron_cli() -> anyhow::Result<()> {
.unwrap(); .unwrap();
println!("Waiting for {:?}", t); println!("Waiting for {:?}", t);
tokio::time::sleep(t).await; tokio::time::sleep(t).await;
auto_cli(AutoArgs { n_products: None }).await.unwrap(); auto_cli(AutoArgs {
n_products: None,
only_supermercado: None,
})
.await
.unwrap();
} }
} }

View file

@@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
use itertools::Itertools; use itertools::Itertools;
use reqwest::Url; use reqwest::Url;
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint}; use crate::{
build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
};
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> { pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let ean = dom let ean = dom
@@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
.finish(); .finish();
let client = &client; let client = &client;
async move { async move {
let text = get_retry_policy() let text = get_fetch_retry_policy()
.retry_if( .retry_if(
|| do_request(client, u.as_str()).and_then(|r| r.text()), || do_request(client, u.as_str()).and_then(|r| r.text()),
retry_if_wasnt_not_found, retry_if_wasnt_not_found,

View file

@@ -0,0 +1,50 @@
use anyhow::Context;
use simple_error::bail;
use crate::sites::common;
use crate::PrecioPoint;
use super::vtex;
use super::vtex::find_product_ld;
use super::vtex::AvailabilityLd;
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
let ean = common::get_meta_content(dom, "product:retailer_item_id")
.context("Parsing EAN")?
.to_string();
let precio_centavos = common::price_from_meta(dom)?;
let (name, image_url, in_stock) = match find_product_ld(dom) {
Some(pm) => {
let p = pm?;
(
Some(p.name),
Some(p.image),
Some(
p.offers.offers.first().context("No offer")?.availability
== AvailabilityLd::InStock,
),
)
}
None => bail!("No JSON/LD"),
};
Ok(PrecioPoint {
ean,
fetched_at: crate::now_sec(),
in_stock,
name,
image_url,
parser_version: 5,
precio_centavos,
url,
})
}
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
let urls = vec![
"https://www.farmacity.com/sitemap/product-0.xml",
"https://www.farmacity.com/sitemap/product-1.xml",
];
vtex::get_urls_from_sitemap(urls).await
}

View file

@@ -7,7 +7,7 @@ use serde::Deserialize;
use simple_error::bail; use simple_error::bail;
use crate::sites::common; use crate::sites::common;
use crate::{do_request, get_retry_policy, PrecioPoint}; use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
use super::vtex; use super::vtex;
@@ -31,7 +31,7 @@ async fn get_ean_from_search(
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku))); url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
url url
}; };
let s = get_retry_policy() let s = get_fetch_retry_policy()
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text())) .retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
.await?; .await?;
let ean = { let ean = {

View file

@@ -2,5 +2,6 @@ pub mod carrefour;
mod common; mod common;
pub mod coto; pub mod coto;
pub mod dia; pub mod dia;
pub mod farmacity;
pub mod jumbo; pub mod jumbo;
pub mod vtex; pub mod vtex;

View file

@@ -10,7 +10,7 @@ use serde_json::json;
use simple_error::SimpleError; use simple_error::SimpleError;
use tl::VDom; use tl::VDom;
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found}; use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
use super::common; use super::common;
@@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
let url = url.to_string(); let url = url.to_string();
async move { async move {
let client = client; let client = client;
let text = get_retry_policy() let text = get_fetch_retry_policy()
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found) .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
.await? .await?
.text() .text()
@@ -152,7 +152,7 @@
} }
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> { async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
let body = get_retry_policy() let body = get_fetch_retry_policy()
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found) .retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
.await? .await?
.text() .text()

View file

@@ -1,11 +1,12 @@
use clap::ValueEnum; use clap::ValueEnum;
#[derive(ValueEnum, Clone, Debug)] #[derive(ValueEnum, Clone, Debug, Copy)]
pub enum Supermercado { pub enum Supermercado {
Dia, Dia,
Jumbo, Jumbo,
Carrefour, Carrefour,
Coto, Coto,
Farmacity,
} }
impl Supermercado { impl Supermercado {
pub fn host(&self) -> &'static str { pub fn host(&self) -> &'static str {
@@ -14,6 +15,7 @@ impl Supermercado {
Self::Carrefour => "www.carrefour.com.ar", Self::Carrefour => "www.carrefour.com.ar",
Self::Coto => "www.cotodigital3.com.ar", Self::Coto => "www.cotodigital3.com.ar",
Self::Jumbo => "www.jumbo.com.ar", Self::Jumbo => "www.jumbo.com.ar",
Self::Farmacity => "www.farmacity.com",
} }
} }
} }

View file

@@ -1,38 +1,50 @@
<script lang="ts"> <script lang="ts">
import { Supermercado, hosts } from "db-datos/supermercado"; import { Supermercado, hosts } from "db-datos/supermercado";
import * as schema from "db-datos/schema";
import type { PageData } from "./$types"; import type { PageData } from "./$types";
import Chart from "./Chart.svelte"; import Chart from "./Chart.svelte";
export let data: PageData; export let data: PageData;
let urls: Map<Supermercado, string>; let urls: Map<Supermercado, schema.Precio>;
$: urls = data.precios.toReversed().reduce((prev, curr) => { $: urls = data.precios.reduce((prev, curr) => {
const url = new URL(curr.url); const url = new URL(curr.url);
const supermercado = hosts[url.hostname]; const supermercado = hosts[url.hostname];
prev.set(supermercado, curr.url); prev.set(supermercado, curr);
return prev; return prev;
}, new Map<Supermercado, string>()); }, new Map<Supermercado, schema.Precio>());
const classBySupermercado: { [supermercado in Supermercado]: string } = { const classBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]", [Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]", [Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]", [Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]", [Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
[Supermercado.Farmacity]: "bg-[#EF7603] focus:ring-[#EF7603]",
}; };
const formatter = new Intl.NumberFormat("es-AR", {
style: "currency",
currency: "ARS",
});
</script> </script>
{#if data.meta} {#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1> <h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" /> <img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
<div class="flex gap-2"> <div class="flex gap-2">
{#each urls as [supermercado, url]} {#each urls as [supermercado, { url, precioCentavos }]}
<a <a
href={url} href={url}
rel="noreferrer noopener" rel="noreferrer noopener"
target="_blank" target="_blank"
class={`focus:shadow-outline inline-flex items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 text-sm font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`} class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
> >
Ver en {supermercado} {#if precioCentavos}
<span class="text-lg font-bold"
>{formatter.format(precioCentavos / 100)}</span
>
{/if}
<span class="text-sm">{supermercado}</span>
</a> </a>
{/each} {/each}
</div> </div>