mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-25 19:16:19 +00:00
Compare commits
9 commits
ed7af7621d
...
eca98c616e
Author | SHA1 | Date | |
---|---|---|---|
eca98c616e | |||
9d9249e2b3 | |||
00fa20f625 | |||
1118bcf75d | |||
d1d496514c | |||
c66c325a4f | |||
b807de8eb4 | |||
18771cb944 | |||
d55fc8f603 |
10 changed files with 150 additions and 43 deletions
|
@ -9,7 +9,7 @@ import { migrateDb } from "./migrate.js";
|
|||
let db = null;
|
||||
export function getDb() {
|
||||
const sqlite = new Database(DB_PATH);
|
||||
db = drizzle(sqlite, { schema });
|
||||
db = drizzle(sqlite, { schema, logger: true });
|
||||
migrateDb(db);
|
||||
return db;
|
||||
}
|
||||
|
|
|
@ -3,25 +3,31 @@ export enum Supermercado {
|
|||
Carrefour = "Carrefour",
|
||||
Coto = "Coto",
|
||||
Jumbo = "Jumbo",
|
||||
Farmacity = "Farmacity",
|
||||
}
|
||||
export const supermercados: Supermercado[] = [
|
||||
Supermercado.Carrefour,
|
||||
Supermercado.Coto,
|
||||
Supermercado.Dia,
|
||||
Supermercado.Jumbo,
|
||||
Supermercado.Farmacity,
|
||||
];
|
||||
export const hosts: { [host: string]: Supermercado } = {
|
||||
"diaonline.supermercadosdia.com.ar": Supermercado.Dia,
|
||||
"www.carrefour.com.ar": Supermercado.Carrefour,
|
||||
"www.cotodigital3.com.ar": Supermercado.Coto,
|
||||
"www.jumbo.com.ar": Supermercado.Jumbo,
|
||||
"www.farmacity.com": Supermercado.Farmacity,
|
||||
};
|
||||
export const hostBySupermercado = Object.fromEntries(
|
||||
Object.entries(hosts).map(([a, b]) => [b, a])
|
||||
) as Record<Supermercado, string>;
|
||||
|
||||
// también actualizar en sitio/src/routes/ean/[ean]/+page.svelte
|
||||
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||
[Supermercado.Dia]: "#d52b1e",
|
||||
[Supermercado.Carrefour]: "#19549d",
|
||||
[Supermercado.Coto]: "#e20025",
|
||||
[Supermercado.Jumbo]: "#2dc850",
|
||||
[Supermercado.Farmacity]: "#EF7603",
|
||||
};
|
||||
|
|
|
@ -46,10 +46,12 @@ struct GetUrlListArgs {
|
|||
struct ScrapUrlArgs {
|
||||
url: String,
|
||||
}
|
||||
#[derive(clap::Args)]
|
||||
#[derive(clap::Args, Clone, Copy)]
|
||||
struct AutoArgs {
|
||||
#[arg(long)]
|
||||
n_products: Option<usize>,
|
||||
#[arg(long)]
|
||||
only_supermercado: Option<Supermercado>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
@ -184,18 +186,25 @@ async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Resul
|
|||
res.text().await
|
||||
}
|
||||
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
|
||||
get_retry_policy()
|
||||
get_fetch_retry_policy()
|
||||
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn get_retry_policy() -> again::RetryPolicy {
|
||||
pub fn get_fetch_retry_policy() -> again::RetryPolicy {
|
||||
RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(20)
|
||||
.with_max_delay(Duration::from_secs(40))
|
||||
.with_jitter(true)
|
||||
}
|
||||
|
||||
pub fn get_parse_retry_policy() -> again::RetryPolicy {
|
||||
RetryPolicy::exponential(Duration::from_millis(1500))
|
||||
.with_max_retries(5)
|
||||
.with_max_delay(Duration::from_secs(5))
|
||||
.with_jitter(true)
|
||||
}
|
||||
|
||||
pub fn retry_if_wasnt_not_found(err: &reqwest::Error) -> bool {
|
||||
!err.status().is_some_and(|s| s == StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
@ -205,24 +214,38 @@ async fn fetch_and_parse(
|
|||
client: &reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let body = fetch_body(client, &url).await?;
|
||||
async fn fetch_and_scrap(
|
||||
client: &reqwest::Client,
|
||||
url: String,
|
||||
) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let body = fetch_body(client, &url).await?;
|
||||
let maybe_point = { scrap_url(client, url, &body).await };
|
||||
|
||||
let maybe_point = { scrap_url(client, url, &body).await };
|
||||
let point = match maybe_point {
|
||||
Ok(p) => Ok(p),
|
||||
Err(err) => {
|
||||
let now: DateTime<Utc> = Utc::now();
|
||||
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
|
||||
tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||
let file_path = debug_path.join(format!("{}.html", nanoid!()));
|
||||
tokio::fs::write(&file_path, &body).await.unwrap();
|
||||
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
|
||||
Err(err)
|
||||
}
|
||||
}?;
|
||||
|
||||
let point = match maybe_point {
|
||||
Ok(p) => Ok(p),
|
||||
Err(err) => {
|
||||
let now: DateTime<Utc> = Utc::now();
|
||||
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
|
||||
tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||
let file_path = debug_path.join(format!("{}.html", nanoid!()));
|
||||
tokio::fs::write(&file_path, &body).await.unwrap();
|
||||
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
|
||||
Err(err)
|
||||
}
|
||||
}?;
|
||||
Ok(point)
|
||||
}
|
||||
|
||||
Ok(point)
|
||||
get_parse_retry_policy()
|
||||
.retry_if(
|
||||
|| fetch_and_scrap(client, url.clone()),
|
||||
|err: &anyhow::Error| match err.downcast_ref::<reqwest::Error>() {
|
||||
Some(e) => !e.status().is_some_and(|s| s == StatusCode::NOT_FOUND),
|
||||
None => true,
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||
|
@ -262,6 +285,7 @@ async fn get_urls(supermercado: &Supermercado) -> Result<Vec<String>, anyhow::Er
|
|||
Supermercado::Jumbo => sites::jumbo::get_urls().await?,
|
||||
Supermercado::Carrefour => sites::carrefour::get_urls().await?,
|
||||
Supermercado::Coto => sites::coto::get_urls().await?,
|
||||
Supermercado::Farmacity => sites::farmacity::get_urls().await?,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -282,6 +306,9 @@ async fn scrap_url(
|
|||
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||
}
|
||||
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
||||
"www.farmacity.com" => {
|
||||
sites::farmacity::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||
}
|
||||
s => bail!("Unknown host {}", s),
|
||||
}
|
||||
}
|
||||
|
@ -296,7 +323,7 @@ struct AutoTelegram {
|
|||
struct Auto {
|
||||
db: Db,
|
||||
telegram: Option<AutoTelegram>,
|
||||
limit_n_products: Option<usize>,
|
||||
args: AutoArgs,
|
||||
}
|
||||
impl Auto {
|
||||
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
|
||||
|
@ -312,7 +339,7 @@ impl Auto {
|
|||
}
|
||||
let links: Vec<String> = {
|
||||
let mut links = self.db.get_urls_by_domain(supermercado.host()).await?;
|
||||
if let Some(n) = self.limit_n_products {
|
||||
if let Some(n) = self.args.n_products {
|
||||
links.truncate(n);
|
||||
}
|
||||
links
|
||||
|
@ -386,14 +413,16 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
|
|||
}
|
||||
}
|
||||
};
|
||||
Auto {
|
||||
db,
|
||||
telegram,
|
||||
limit_n_products: args.n_products,
|
||||
}
|
||||
Auto { db, telegram, args }
|
||||
};
|
||||
auto.inform("[auto] Empezando scrap").await;
|
||||
let handles: Vec<_> = Supermercado::value_variants()
|
||||
|
||||
let supermercados = match args.only_supermercado {
|
||||
Some(supermercado) => [supermercado].to_vec(),
|
||||
None => Supermercado::value_variants().to_vec(),
|
||||
};
|
||||
|
||||
let handles: Vec<_> = supermercados
|
||||
.iter()
|
||||
.map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned())))
|
||||
.collect();
|
||||
|
@ -425,7 +454,12 @@ async fn cron_cli() -> anyhow::Result<()> {
|
|||
.unwrap();
|
||||
println!("Waiting for {:?}", t);
|
||||
tokio::time::sleep(t).await;
|
||||
auto_cli(AutoArgs { n_products: None }).await.unwrap();
|
||||
auto_cli(AutoArgs {
|
||||
n_products: None,
|
||||
only_supermercado: None,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3,7 +3,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
|
|||
use itertools::Itertools;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found, PrecioPoint};
|
||||
use crate::{
|
||||
build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found, PrecioPoint,
|
||||
};
|
||||
|
||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let ean = dom
|
||||
|
@ -90,7 +92,7 @@ pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
|||
.finish();
|
||||
let client = &client;
|
||||
async move {
|
||||
let text = get_retry_policy()
|
||||
let text = get_fetch_retry_policy()
|
||||
.retry_if(
|
||||
|| do_request(client, u.as_str()).and_then(|r| r.text()),
|
||||
retry_if_wasnt_not_found,
|
||||
|
|
50
scraper-rs/src/sites/farmacity.rs
Normal file
50
scraper-rs/src/sites/farmacity.rs
Normal file
|
@ -0,0 +1,50 @@
|
|||
use anyhow::Context;
|
||||
use simple_error::bail;
|
||||
|
||||
use crate::sites::common;
|
||||
use crate::PrecioPoint;
|
||||
|
||||
use super::vtex;
|
||||
use super::vtex::find_product_ld;
|
||||
use super::vtex::AvailabilityLd;
|
||||
|
||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||
let ean = common::get_meta_content(dom, "product:retailer_item_id")
|
||||
.context("Parsing EAN")?
|
||||
.to_string();
|
||||
let precio_centavos = common::price_from_meta(dom)?;
|
||||
|
||||
let (name, image_url, in_stock) = match find_product_ld(dom) {
|
||||
Some(pm) => {
|
||||
let p = pm?;
|
||||
(
|
||||
Some(p.name),
|
||||
Some(p.image),
|
||||
Some(
|
||||
p.offers.offers.first().context("No offer")?.availability
|
||||
== AvailabilityLd::InStock,
|
||||
),
|
||||
)
|
||||
}
|
||||
None => bail!("No JSON/LD"),
|
||||
};
|
||||
|
||||
Ok(PrecioPoint {
|
||||
ean,
|
||||
fetched_at: crate::now_sec(),
|
||||
in_stock,
|
||||
name,
|
||||
image_url,
|
||||
parser_version: 5,
|
||||
precio_centavos,
|
||||
url,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_urls() -> anyhow::Result<Vec<String>> {
|
||||
let urls = vec![
|
||||
"https://www.farmacity.com/sitemap/product-0.xml",
|
||||
"https://www.farmacity.com/sitemap/product-1.xml",
|
||||
];
|
||||
vtex::get_urls_from_sitemap(urls).await
|
||||
}
|
|
@ -7,7 +7,7 @@ use serde::Deserialize;
|
|||
use simple_error::bail;
|
||||
|
||||
use crate::sites::common;
|
||||
use crate::{do_request, get_retry_policy, PrecioPoint};
|
||||
use crate::{do_request, get_fetch_retry_policy, PrecioPoint};
|
||||
|
||||
use super::vtex;
|
||||
|
||||
|
@ -31,7 +31,7 @@ async fn get_ean_from_search(
|
|||
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
||||
url
|
||||
};
|
||||
let s = get_retry_policy()
|
||||
let s = get_fetch_retry_policy()
|
||||
.retry(|| do_request(client, url.as_str()).and_then(|r| r.text()))
|
||||
.await?;
|
||||
let ean = {
|
||||
|
|
|
@ -2,5 +2,6 @@ pub mod carrefour;
|
|||
mod common;
|
||||
pub mod coto;
|
||||
pub mod dia;
|
||||
pub mod farmacity;
|
||||
pub mod jumbo;
|
||||
pub mod vtex;
|
||||
|
|
|
@ -10,7 +10,7 @@ use serde_json::json;
|
|||
use simple_error::SimpleError;
|
||||
use tl::VDom;
|
||||
|
||||
use crate::{build_client, do_request, get_retry_policy, retry_if_wasnt_not_found};
|
||||
use crate::{build_client, do_request, get_fetch_retry_policy, retry_if_wasnt_not_found};
|
||||
|
||||
use super::common;
|
||||
|
||||
|
@ -132,7 +132,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
|||
let url = url.to_string();
|
||||
async move {
|
||||
let client = client;
|
||||
let text = get_retry_policy()
|
||||
let text = get_fetch_retry_policy()
|
||||
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
||||
.await?
|
||||
.text()
|
||||
|
@ -152,7 +152,7 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
|||
}
|
||||
|
||||
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
|
||||
let body = get_retry_policy()
|
||||
let body = get_fetch_retry_policy()
|
||||
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
|
||||
.await?
|
||||
.text()
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
use clap::ValueEnum;
|
||||
|
||||
#[derive(ValueEnum, Clone, Debug)]
|
||||
#[derive(ValueEnum, Clone, Debug, Copy)]
|
||||
pub enum Supermercado {
|
||||
Dia,
|
||||
Jumbo,
|
||||
Carrefour,
|
||||
Coto,
|
||||
Farmacity,
|
||||
}
|
||||
impl Supermercado {
|
||||
pub fn host(&self) -> &'static str {
|
||||
|
@ -14,6 +15,7 @@ impl Supermercado {
|
|||
Self::Carrefour => "www.carrefour.com.ar",
|
||||
Self::Coto => "www.cotodigital3.com.ar",
|
||||
Self::Jumbo => "www.jumbo.com.ar",
|
||||
Self::Farmacity => "www.farmacity.com",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,38 +1,50 @@
|
|||
<script lang="ts">
|
||||
import { Supermercado, hosts } from "db-datos/supermercado";
|
||||
import * as schema from "db-datos/schema";
|
||||
import type { PageData } from "./$types";
|
||||
import Chart from "./Chart.svelte";
|
||||
|
||||
export let data: PageData;
|
||||
|
||||
let urls: Map<Supermercado, string>;
|
||||
$: urls = data.precios.toReversed().reduce((prev, curr) => {
|
||||
let urls: Map<Supermercado, schema.Precio>;
|
||||
$: urls = data.precios.reduce((prev, curr) => {
|
||||
const url = new URL(curr.url);
|
||||
const supermercado = hosts[url.hostname];
|
||||
prev.set(supermercado, curr.url);
|
||||
prev.set(supermercado, curr);
|
||||
return prev;
|
||||
}, new Map<Supermercado, string>());
|
||||
}, new Map<Supermercado, schema.Precio>());
|
||||
|
||||
const classBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
||||
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
|
||||
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
|
||||
[Supermercado.Jumbo]: "bg-[#2dc850] focus:ring-[#2dc850]",
|
||||
[Supermercado.Farmacity]: "bg-[#EF7603] focus:ring-[#EF7603]",
|
||||
};
|
||||
|
||||
const formatter = new Intl.NumberFormat("es-AR", {
|
||||
style: "currency",
|
||||
currency: "ARS",
|
||||
});
|
||||
</script>
|
||||
|
||||
{#if data.meta}
|
||||
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
|
||||
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
|
||||
<div class="flex gap-2">
|
||||
{#each urls as [supermercado, url]}
|
||||
{#each urls as [supermercado, { url, precioCentavos }]}
|
||||
<a
|
||||
href={url}
|
||||
rel="noreferrer noopener"
|
||||
target="_blank"
|
||||
class={`focus:shadow-outline inline-flex items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 text-sm font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
|
||||
class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
|
||||
>
|
||||
Ver en {supermercado}
|
||||
{#if precioCentavos}
|
||||
<span class="text-lg font-bold"
|
||||
>{formatter.format(precioCentavos / 100)}</span
|
||||
>
|
||||
{/if}
|
||||
<span class="text-sm">{supermercado}</span>
|
||||
</a>
|
||||
{/each}
|
||||
</div>
|
||||
|
|
Loading…
Reference in a new issue