From 1c18b20e4d42fc0160fa08576424cfa2689cfee4 Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 29 Jan 2024 12:06:35 -0300 Subject: [PATCH] WIP: mostrar best selling --- db-datos/drizzle/0010_true_black_tom.sql | 6 + db-datos/drizzle/meta/0010_snapshot.json | 183 +++++++++++++++++++++++ db-datos/drizzle/meta/_journal.json | 7 + db-datos/schema.js | 11 +- scraper-rs/Cargo.lock | 5 +- scraper-rs/Cargo.toml | 1 + scraper-rs/src/best_selling.rs | 142 ++++++++++++++++++ scraper-rs/src/main.rs | 98 ++++++++---- scraper-rs/src/sites/mod.rs | 2 +- scraper-rs/src/sites/vtex.rs | 109 +++++++++++++- scraper-rs/src/supermercado.rs | 19 +++ 11 files changed, 551 insertions(+), 32 deletions(-) create mode 100644 db-datos/drizzle/0010_true_black_tom.sql create mode 100644 db-datos/drizzle/meta/0010_snapshot.json create mode 100644 scraper-rs/src/best_selling.rs create mode 100644 scraper-rs/src/supermercado.rs diff --git a/db-datos/drizzle/0010_true_black_tom.sql b/db-datos/drizzle/0010_true_black_tom.sql new file mode 100644 index 0000000..f55ee87 --- /dev/null +++ b/db-datos/drizzle/0010_true_black_tom.sql @@ -0,0 +1,6 @@ +CREATE TABLE `db_best_selling` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `fetched_at` integer NOT NULL, + `category` text NOT NULL, + `eans_json` text NOT NULL +); diff --git a/db-datos/drizzle/meta/0010_snapshot.json b/db-datos/drizzle/meta/0010_snapshot.json new file mode 100644 index 0000000..d2a437e --- /dev/null +++ b/db-datos/drizzle/meta/0010_snapshot.json @@ -0,0 +1,183 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "c8297337-4ed8-432e-8782-65d41be42e00", + "prevId": "2e398920-ffaf-4d55-ae13-d906cb9e0efa", + "tables": { + "db_best_selling": { + "name": "db_best_selling", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "category": { + "name": "category", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "eans_json": { + "name": "eans_json", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "precios": { + "name": "precios", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "ean": { + "name": "ean", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "precio_centavos": { + "name": "precio_centavos", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "in_stock": { + "name": "in_stock", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "warc_record_id": { + "name": "warc_record_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "parser_version": { + "name": "parser_version", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image_url": { + "name": "image_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "precios_ean_idx": { + "name": "precios_ean_idx", + "columns": [ + "ean" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "producto_urls": { + "name": "producto_urls", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "first_seen": { + "name": "first_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "last_seen": { + "name": "last_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "producto_urls_url_unique": { + "name": "producto_urls_url_unique", + "columns": [ + "url" + ], + "isUnique": true + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +} \ No newline at end of file diff --git a/db-datos/drizzle/meta/_journal.json b/db-datos/drizzle/meta/_journal.json index bd847ef..c9d8729 100644 --- a/db-datos/drizzle/meta/_journal.json +++ b/db-datos/drizzle/meta/_journal.json @@ -71,6 +71,13 @@ "when": 1703895109501, "tag": "0009_breezy_forge", "breakpoints": true + }, + { + "idx": 10, + "version": "5", + "when": 1706540751931, + "tag": "0010_true_black_tom", + "breakpoints": true } ] } \ No newline at end of file diff --git a/db-datos/schema.js b/db-datos/schema.js index 2b921b9..eabdc35 100644 --- a/db-datos/schema.js +++ b/db-datos/schema.js @@ -19,7 +19,7 @@ export const precios = sqliteTable( return { preciosEanIdx: index("precios_ean_idx").on(precios.ean), }; - }, + } ); /** @typedef {typeof precios.$inferSelect} Precio */ @@ -32,3 +32,12 @@ export const productoUrls = sqliteTable("producto_urls", { }); /** @typedef {typeof productoUrls.$inferSelect} ProductUrl */ + +export const bestSelling = sqliteTable("db_best_selling", { + id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), + fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(), + category: text("category").notNull(), + eansJson: text("eans_json").notNull(), +}); + +/** @typedef {typeof bestSelling.$inferSelect} BestSelling */ diff --git a/scraper-rs/Cargo.lock b/scraper-rs/Cargo.lock index c540cc5..30ea007 100644 --- a/scraper-rs/Cargo.lock +++ b/scraper-rs/Cargo.lock @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.5" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -1232,6 +1232,7 @@ version = "0.1.0" dependencies = [ "again", "anyhow", + "base64", "chrono", "clap", "cron", diff --git a/scraper-rs/Cargo.toml b/scraper-rs/Cargo.toml index 336d5d5..e8b1a0a 100644 --- a/scraper-rs/Cargo.toml +++ b/scraper-rs/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" [dependencies] again = "0.1.2" anyhow = "1.0.79" +base64 = "0.21.7" chrono = "0.4.32" clap = { version = "4.4.15", features = ["derive"] } cron = "0.12.0" diff --git a/scraper-rs/src/best_selling.rs b/scraper-rs/src/best_selling.rs new file mode 100644 index 0000000..2f3e586 --- /dev/null +++ b/scraper-rs/src/best_selling.rs @@ -0,0 +1,142 @@ +use std::collections::HashMap; + +use crate::{build_client, sites::vtex, supermercado::Supermercado}; +use chrono::{DateTime, Utc}; +use clap::ValueEnum; +use deadpool_sqlite::Pool; +use futures::{stream, FutureExt, StreamExt, TryStreamExt}; +use itertools::Itertools; +use tracing::warn; + +#[derive(ValueEnum, Clone, Debug)] +pub enum Category { + Almacen, + Bebidas, + FrutasYVerduras, +} +impl Category { + fn query(&self, supermercado: &Supermercado) -> Option<&'static str> { + match self { + Self::Almacen => match supermercado { + Supermercado::Jumbo | Supermercado::Dia => Some("almacen"), + _ => None, + }, + Self::Bebidas => match supermercado { + Supermercado::Jumbo | Supermercado::Dia => Some("bebidas"), + _ => None, + }, + Self::FrutasYVerduras => match supermercado { + Supermercado::Jumbo => Some("frutas-y-verduras"), + Supermercado::Dia => Some("frescos/frutas-y-verduras"), + _ => None, + }, + } + } + + pub fn id(&self) -> &'static str { + match self { + Self::Almacen => "almacen", + Self::Bebidas => "bebidas", + Self::FrutasYVerduras => "frutas-y-verduras", + } + } +} + +#[derive(Debug)] +pub struct BestSellingRecord { + pub fetched_at: DateTime, + pub category: Category, + pub eans: Vec, +} + +async fn get_best_selling_eans(pool: &Pool, urls: Vec) -> anyhow::Result> { + let mut eans: Vec = Vec::new(); + + for url in urls { + let q = url.clone(); + let ean = pool + .get() + .await? + .interact(move |conn| { + conn.prepare(r#"SELECT ean FROM precios WHERE url = ?1;"#)? + .query_map(rusqlite::params![q], |r| r.get::<_, String>(0)) + .map(|r| r.map(|r| r.unwrap()).next()) + }) + .await + .unwrap()?; + match ean { + Some(e) => eans.push(e), + None => warn!("No encontré EAN para {}", url), + } + } + + Ok(eans) +} + +async fn try_get_best_selling_eans( + client: reqwest::Client, + pool: Pool, + supermercado: &Supermercado, + category: &Category, +) -> anyhow::Result>> { + if let Some(query) = category.query(supermercado) { + let urls = vtex::get_best_selling_by_category(&client, supermercado.host(), query).await?; + let eans = get_best_selling_eans(&pool, urls).await?; + Ok(Some(eans)) + } else { + Ok(None) + } +} + +async fn noop(t: T) -> anyhow::Result { + Ok(t) +} + +fn rank_eans(eans: Vec>) -> Vec { + let mut map: HashMap = HashMap::new(); + for eans in eans { + for (i, ean) in eans.into_iter().enumerate() { + let base = map.get(&ean).unwrap_or(&0); + let score = base + 1000 / (i + 1); + map.insert(ean, score); + } + } + map.into_iter() + .sorted_by(|a, b| Ord::cmp(&b.1, &a.1)) + .map(|t| t.0) + .collect_vec() +} + +pub async fn get_all_best_selling(pool: &Pool) -> anyhow::Result> { + let client = &build_client(); + + stream::iter(Category::value_variants()) + .map(|category| { + stream::iter(Supermercado::value_variants()) + .map(|supermercado| { + let pool = pool.clone(); + let client = client.clone(); + tokio::spawn(try_get_best_selling_eans( + client, + pool, + supermercado, + category, + )) + }) + .buffer_unordered(5) + .map(|f| f.unwrap()) + .try_filter_map(noop) + .try_collect::>>() + .map(|r| { + r.map(rank_eans).map(|eans| BestSellingRecord { + fetched_at: Utc::now(), + category: category.clone(), + eans, + }) + }) + }) + .buffer_unordered(5) + .boxed() + .try_collect() + .await +} diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 565b0fd..47e2055 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -1,10 +1,11 @@ use again::RetryPolicy; +use best_selling::BestSellingRecord; use clap::{Parser, ValueEnum}; use cron::Schedule; use deadpool_sqlite::Pool; -use futures::{future, stream, StreamExt}; +use futures::{future, stream, Future, StreamExt}; use nanoid::nanoid; -use reqwest::{StatusCode, Url}; +use reqwest::{header::HeaderMap, StatusCode, Url}; use simple_error::{bail, SimpleError}; use std::{ env::{self}, @@ -15,23 +16,8 @@ use std::{ }; use thiserror::Error; -#[derive(ValueEnum, Clone, Debug)] -enum Supermercado { - Dia, - Jumbo, - Carrefour, - Coto, -} -impl Supermercado { - fn host(&self) -> &'static str { - match self { - Self::Dia => "diaonline.supermercadosdia.com.ar", - Self::Carrefour => "www.carrefour.com.ar", - Self::Coto => "www.cotodigital3.com.ar", - Self::Jumbo => "www.jumbo.com.ar", - } - } -} +mod supermercado; +use supermercado::Supermercado; #[derive(Parser)] // requires `derive` feature enum Args { @@ -39,6 +25,7 @@ enum Args { ParseFile(ParseFileArgs), GetUrlList(GetUrlListArgs), ScrapUrl(ScrapUrlArgs), + ScrapBestSelling, Auto(AutoArgs), Cron(AutoArgs), } @@ -71,6 +58,7 @@ async fn main() -> anyhow::Result<()> { Args::ParseFile(a) => parse_file_cli(a.file_path).await, Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await, Args::ScrapUrl(a) => scrap_url_cli(a.url).await, + Args::ScrapBestSelling => scrap_best_selling_cli().await, Args::Auto(_) => auto_cli().await, Args::Cron(_) => cron_cli().await, } @@ -83,6 +71,14 @@ async fn scrap_url_cli(url: String) -> anyhow::Result<()> { println!("Result: {:#?}", res); res.map(|_| ()) } +mod best_selling; +async fn scrap_best_selling_cli() -> anyhow::Result<()> { + let db = connect_db(); + let res = best_selling::get_all_best_selling(&db).await; + + println!("Result: {:#?}", res); + res.map(|_| ()) +} async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> { let links_str = fs::read_to_string(links_list_path).unwrap(); @@ -131,14 +127,6 @@ fn connect_db() -> Pool { cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap() } -fn build_client() -> reqwest::Client { - reqwest::ClientBuilder::default() - .timeout(Duration::from_secs(60 * 5)) - .connect_timeout(Duration::from_secs(60)) - .build() - .unwrap() -} - #[derive(Default, Debug)] struct Counters { success: u64, @@ -190,6 +178,16 @@ enum FetchError { Tl(#[from] tl::ParseError), } +fn build_client() -> reqwest::Client { + let mut headers = HeaderMap::new(); + headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap()); + reqwest::ClientBuilder::default() + .timeout(Duration::from_secs(60 * 5)) + .connect_timeout(Duration::from_secs(60)) + .default_headers(headers) + .build() + .unwrap() +} pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result { let request = client.get(url).build()?; let response = client.execute(request).await?.error_for_status()?; @@ -356,9 +354,26 @@ impl Auto { )) .await; } + + let best_selling = self + .inform_time( + "Downloaded best selling", + best_selling::get_all_best_selling(&self.pool), + ) + .await?; + self.save_best_selling(best_selling).await?; + Ok(()) } + async fn inform_time, R>(&self, msg: &str, action: T) -> R { + let t0 = now_sec(); + let res = action.await; + self.inform(&format!("{} (took {})", msg, now_sec() - t0)) + .await; + res + } + async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> { let urls = get_urls(supermercado).await?; self.pool @@ -385,6 +400,35 @@ impl Auto { Ok(()) } + async fn save_best_selling(&self, best_selling: Vec) -> anyhow::Result<()> { + self.pool + .get() + .await? + .interact(move |conn| -> Result<(), anyhow::Error> { + let tx = conn.transaction()?; + { + let mut stmt = tx.prepare( + r#"INSERT INTO db_best_selling(fetched_at, category, eans_json) + VALUES (?1, ?2, ?3);"#, + )?; + for record in best_selling { + let eans_json = serde_json::Value::from(record.eans).to_string(); + let fetched_at = record.fetched_at.timestamp_millis(); + stmt.execute(rusqlite::params![ + fetched_at, + record.category.id(), + eans_json + ])?; + } + } + tx.commit()?; + Ok(()) + }) + .await + .unwrap()?; + Ok(()) + } + async fn inform(&self, msg: &str) { println!("{}", msg); if let Some(telegram) = &self.telegram { diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs index 019de83..e305f94 100644 --- a/scraper-rs/src/sites/mod.rs +++ b/scraper-rs/src/sites/mod.rs @@ -3,4 +3,4 @@ mod common; pub mod coto; pub mod dia; pub mod jumbo; -mod vtex; +pub mod vtex; diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs index 7b09945..1fdb44e 100644 --- a/scraper-rs/src/sites/vtex.rs +++ b/scraper-rs/src/sites/vtex.rs @@ -1,7 +1,12 @@ +use std::str::FromStr; + use anyhow::{bail, Context}; +use base64::Engine; use futures::{stream, StreamExt, TryStreamExt}; use itertools::Itertools; +use reqwest::Url; use serde::Deserialize; +use serde_json::json; use simple_error::SimpleError; use tl::VDom; @@ -100,7 +105,7 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result { "instock" => true, _ => bail!("Not a valid product:availability"), }, - None => bail!("No product:availability in carrefour"), + None => bail!("No product:availability in vtex"), }, ) } @@ -146,6 +151,101 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result(client: &reqwest::Client, url: &str) -> anyhow::Result { + let body = get_retry_policy() + .retry_if(|| do_request(client, url), retry_if_wasnt_not_found) + .await? + .text() + .await?; + Ok(body) +} + +async fn get_binding_id(client: &reqwest::Client, url: &str) -> anyhow::Result { + let body = fetch_body(client, url).await?; + let dom = tl::parse(&body, tl::ParserOptions::default())?; + let json = parse_script_json(&dom, "__RUNTIME__")?; + let id = json + .as_object() + .ok_or(SimpleError::new("RUNTIME not an object"))? + .get("binding") + .and_then(|v| v.as_object()) + .and_then(|o| o.get("id")) + .and_then(|v| v.as_str()) + .ok_or(SimpleError::new("binding.id does not exist"))? + .to_string(); + Ok(id) +} + +/// Returns a vec of product URLs +/// +/// Estos parametros se consiguen yendo a una página como `https://www.jumbo.com.ar/almacen` y extrayendo: +/// * `domain` - www.jumbo.com.ar +/// * `query` - almacen +/// +/// También `https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras`: +/// * `domain` - diaonline.supermercadosdia.com.ar +/// * `query` - frescos/frutas-y-verduras +pub async fn get_best_selling_by_category( + client: &reqwest::Client, + domain: &str, + query: &str, +) -> anyhow::Result> { + let base_url = { Url::from_str(&format!("https://{}/{}", domain, query)).unwrap() }; + + let binding_id = get_binding_id(client, base_url.as_str()).await?; + let url = { + let mut url = base_url.clone(); + url.set_path("/_v/segment/graphql/v1"); + url.query_pairs_mut().append_pair("workspace", "master") + .append_pair("maxAge", "short") + .append_pair("appsEtag", "remove") + .append_pair("domain", "store") + .append_pair("locale", "es-AR") + .append_pair("__bindingId", &binding_id) + .append_pair("operationName", "productSearchV3") + .append_pair("variables", "%7B%7D") + .append_pair("extensions", &{ + let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets": + query.split('/').map(|f| json!({"key":"c","value":f})).collect::>() + ,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false}); + let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string()); + + format!( + r#"{{ + "persistedQuery": {{ + "version": 1, + "sha256Hash": "40b843ca1f7934d20d05d334916220a0c2cae3833d9f17bcb79cdd2185adceac", + "sender": "vtex.store-resources@0.x", + "provider": "vtex.search-graphql@0.x" + }}, + "variables": "{}" + }}"#, b64 + ) + }); + url + }; + let body = fetch_body(client, url.as_str()).await?; + let urls: Vec = serde_json::from_str::(&body)? + .pointer("/data/productSearch/products") + .and_then(|v| v.as_array()) + .map(|a| { + a.iter() + .filter_map(|p| { + p.get("link") + .and_then(|v| v.as_str()) + .map(|s| format!("https://{}{}", domain, s)) + }) + .collect() + }) + .ok_or(SimpleError::new("failed to get best selling product urls"))?; + + if urls.len() < 2 { + bail!("Too few best selling"); + } + + Ok(urls) +} + #[cfg(test)] mod tests { use super::*; @@ -163,4 +263,11 @@ mod tests { assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p"); Ok(()) } + + #[tokio::test] + async fn test_jumbo_best_selling() -> anyhow::Result<()> { + get_best_selling_by_category(&build_client(), "www.jumbo.com.ar", "almacen").await?; + // assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p"); + Ok(()) + } } diff --git a/scraper-rs/src/supermercado.rs b/scraper-rs/src/supermercado.rs new file mode 100644 index 0000000..ba7994f --- /dev/null +++ b/scraper-rs/src/supermercado.rs @@ -0,0 +1,19 @@ +use clap::ValueEnum; + +#[derive(ValueEnum, Clone, Debug)] +pub enum Supermercado { + Dia, + Jumbo, + Carrefour, + Coto, +} +impl Supermercado { + pub fn host(&self) -> &'static str { + match self { + Self::Dia => "diaonline.supermercadosdia.com.ar", + Self::Carrefour => "www.carrefour.com.ar", + Self::Coto => "www.cotodigital3.com.ar", + Self::Jumbo => "www.jumbo.com.ar", + } + } +}