From e2eb3a1c3f83791752f61ff3b11388f46ce085d4 Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 29 Jan 2024 12:06:35 -0300 Subject: [PATCH 1/8] WIP: mostrar best selling --- db-datos/drizzle/0010_true_black_tom.sql | 6 + db-datos/drizzle/meta/0010_snapshot.json | 183 +++++++++++++++++++++++ db-datos/drizzle/meta/_journal.json | 7 + db-datos/schema.js | 11 +- scraper-rs/Cargo.lock | 5 +- scraper-rs/Cargo.toml | 1 + scraper-rs/src/best_selling.rs | 142 ++++++++++++++++++ scraper-rs/src/main.rs | 98 ++++++++---- scraper-rs/src/sites/mod.rs | 2 +- scraper-rs/src/sites/vtex.rs | 109 +++++++++++++- scraper-rs/src/supermercado.rs | 19 +++ 11 files changed, 551 insertions(+), 32 deletions(-) create mode 100644 db-datos/drizzle/0010_true_black_tom.sql create mode 100644 db-datos/drizzle/meta/0010_snapshot.json create mode 100644 scraper-rs/src/best_selling.rs create mode 100644 scraper-rs/src/supermercado.rs diff --git a/db-datos/drizzle/0010_true_black_tom.sql b/db-datos/drizzle/0010_true_black_tom.sql new file mode 100644 index 0000000..f55ee87 --- /dev/null +++ b/db-datos/drizzle/0010_true_black_tom.sql @@ -0,0 +1,6 @@ +CREATE TABLE `db_best_selling` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `fetched_at` integer NOT NULL, + `category` text NOT NULL, + `eans_json` text NOT NULL +); diff --git a/db-datos/drizzle/meta/0010_snapshot.json b/db-datos/drizzle/meta/0010_snapshot.json new file mode 100644 index 0000000..d2a437e --- /dev/null +++ b/db-datos/drizzle/meta/0010_snapshot.json @@ -0,0 +1,183 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "c8297337-4ed8-432e-8782-65d41be42e00", + "prevId": "2e398920-ffaf-4d55-ae13-d906cb9e0efa", + "tables": { + "db_best_selling": { + "name": "db_best_selling", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "category": { + "name": "category", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "eans_json": { + "name": "eans_json", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "precios": { + "name": "precios", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "ean": { + "name": "ean", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "precio_centavos": { + "name": "precio_centavos", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "in_stock": { + "name": "in_stock", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "warc_record_id": { + "name": "warc_record_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "parser_version": { + "name": "parser_version", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image_url": { + "name": "image_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "precios_ean_idx": { + "name": "precios_ean_idx", + "columns": [ + "ean" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "producto_urls": { + "name": "producto_urls", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "first_seen": { + "name": "first_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "last_seen": { + "name": "last_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "producto_urls_url_unique": { + "name": "producto_urls_url_unique", + "columns": [ + "url" + ], + "isUnique": true + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +} \ No newline at end of file diff --git a/db-datos/drizzle/meta/_journal.json b/db-datos/drizzle/meta/_journal.json index bd847ef..c9d8729 100644 --- a/db-datos/drizzle/meta/_journal.json +++ b/db-datos/drizzle/meta/_journal.json @@ -71,6 +71,13 @@ "when": 1703895109501, "tag": "0009_breezy_forge", "breakpoints": true + }, + { + "idx": 10, + "version": "5", + "when": 1706540751931, + "tag": "0010_true_black_tom", + "breakpoints": true } ] } \ No newline at end of file diff --git a/db-datos/schema.js b/db-datos/schema.js index 2b921b9..eabdc35 100644 --- a/db-datos/schema.js +++ b/db-datos/schema.js @@ -19,7 +19,7 @@ export const precios = sqliteTable( return { preciosEanIdx: index("precios_ean_idx").on(precios.ean), }; - }, + } ); /** @typedef {typeof precios.$inferSelect} Precio */ @@ -32,3 +32,12 @@ export const productoUrls = sqliteTable("producto_urls", { }); /** @typedef {typeof productoUrls.$inferSelect} ProductUrl */ + +export const bestSelling = sqliteTable("db_best_selling", { + id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), + fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(), + category: text("category").notNull(), + eansJson: text("eans_json").notNull(), +}); + +/** @typedef {typeof bestSelling.$inferSelect} BestSelling */ diff --git a/scraper-rs/Cargo.lock b/scraper-rs/Cargo.lock index c540cc5..30ea007 100644 --- a/scraper-rs/Cargo.lock +++ b/scraper-rs/Cargo.lock @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.5" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -1232,6 +1232,7 @@ version = "0.1.0" dependencies = [ "again", "anyhow", + "base64", "chrono", "clap", "cron", diff --git a/scraper-rs/Cargo.toml b/scraper-rs/Cargo.toml index 336d5d5..e8b1a0a 100644 --- a/scraper-rs/Cargo.toml +++ b/scraper-rs/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" [dependencies] again = "0.1.2" anyhow = "1.0.79" +base64 = "0.21.7" chrono = "0.4.32" clap = { version = "4.4.15", features = ["derive"] } cron = "0.12.0" diff --git a/scraper-rs/src/best_selling.rs b/scraper-rs/src/best_selling.rs new file mode 100644 index 0000000..2f3e586 --- /dev/null +++ b/scraper-rs/src/best_selling.rs @@ -0,0 +1,142 @@ +use std::collections::HashMap; + +use crate::{build_client, sites::vtex, supermercado::Supermercado}; +use chrono::{DateTime, Utc}; +use clap::ValueEnum; +use deadpool_sqlite::Pool; +use futures::{stream, FutureExt, StreamExt, TryStreamExt}; +use itertools::Itertools; +use tracing::warn; + +#[derive(ValueEnum, Clone, Debug)] +pub enum Category { + Almacen, + Bebidas, + FrutasYVerduras, +} +impl Category { + fn query(&self, supermercado: &Supermercado) -> Option<&'static str> { + match self { + Self::Almacen => match supermercado { + Supermercado::Jumbo | Supermercado::Dia => Some("almacen"), + _ => None, + }, + Self::Bebidas => match supermercado { + Supermercado::Jumbo | Supermercado::Dia => Some("bebidas"), + _ => None, + }, + Self::FrutasYVerduras => match supermercado { + Supermercado::Jumbo => Some("frutas-y-verduras"), + Supermercado::Dia => Some("frescos/frutas-y-verduras"), + _ => None, + }, + } + } + + pub fn id(&self) -> &'static str { + match self { + Self::Almacen => "almacen", + Self::Bebidas => "bebidas", + Self::FrutasYVerduras => "frutas-y-verduras", + } + } +} + +#[derive(Debug)] +pub struct BestSellingRecord { + pub fetched_at: DateTime, + pub category: Category, + pub eans: Vec, +} + +async fn get_best_selling_eans(pool: &Pool, urls: Vec) -> anyhow::Result> { + let mut eans: Vec = Vec::new(); + + for url in urls { + let q = url.clone(); + let ean = pool + .get() + .await? + .interact(move |conn| { + conn.prepare(r#"SELECT ean FROM precios WHERE url = ?1;"#)? + .query_map(rusqlite::params![q], |r| r.get::<_, String>(0)) + .map(|r| r.map(|r| r.unwrap()).next()) + }) + .await + .unwrap()?; + match ean { + Some(e) => eans.push(e), + None => warn!("No encontré EAN para {}", url), + } + } + + Ok(eans) +} + +async fn try_get_best_selling_eans( + client: reqwest::Client, + pool: Pool, + supermercado: &Supermercado, + category: &Category, +) -> anyhow::Result>> { + if let Some(query) = category.query(supermercado) { + let urls = vtex::get_best_selling_by_category(&client, supermercado.host(), query).await?; + let eans = get_best_selling_eans(&pool, urls).await?; + Ok(Some(eans)) + } else { + Ok(None) + } +} + +async fn noop(t: T) -> anyhow::Result { + Ok(t) +} + +fn rank_eans(eans: Vec>) -> Vec { + let mut map: HashMap = HashMap::new(); + for eans in eans { + for (i, ean) in eans.into_iter().enumerate() { + let base = map.get(&ean).unwrap_or(&0); + let score = base + 1000 / (i + 1); + map.insert(ean, score); + } + } + map.into_iter() + .sorted_by(|a, b| Ord::cmp(&b.1, &a.1)) + .map(|t| t.0) + .collect_vec() +} + +pub async fn get_all_best_selling(pool: &Pool) -> anyhow::Result> { + let client = &build_client(); + + stream::iter(Category::value_variants()) + .map(|category| { + stream::iter(Supermercado::value_variants()) + .map(|supermercado| { + let pool = pool.clone(); + let client = client.clone(); + tokio::spawn(try_get_best_selling_eans( + client, + pool, + supermercado, + category, + )) + }) + .buffer_unordered(5) + .map(|f| f.unwrap()) + .try_filter_map(noop) + .try_collect::>>() + .map(|r| { + r.map(rank_eans).map(|eans| BestSellingRecord { + fetched_at: Utc::now(), + category: category.clone(), + eans, + }) + }) + }) + .buffer_unordered(5) + .boxed() + .try_collect() + .await +} diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 565b0fd..47e2055 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -1,10 +1,11 @@ use again::RetryPolicy; +use best_selling::BestSellingRecord; use clap::{Parser, ValueEnum}; use cron::Schedule; use deadpool_sqlite::Pool; -use futures::{future, stream, StreamExt}; +use futures::{future, stream, Future, StreamExt}; use nanoid::nanoid; -use reqwest::{StatusCode, Url}; +use reqwest::{header::HeaderMap, StatusCode, Url}; use simple_error::{bail, SimpleError}; use std::{ env::{self}, @@ -15,23 +16,8 @@ use std::{ }; use thiserror::Error; -#[derive(ValueEnum, Clone, Debug)] -enum Supermercado { - Dia, - Jumbo, - Carrefour, - Coto, -} -impl Supermercado { - fn host(&self) -> &'static str { - match self { - Self::Dia => "diaonline.supermercadosdia.com.ar", - Self::Carrefour => "www.carrefour.com.ar", - Self::Coto => "www.cotodigital3.com.ar", - Self::Jumbo => "www.jumbo.com.ar", - } - } -} +mod supermercado; +use supermercado::Supermercado; #[derive(Parser)] // requires `derive` feature enum Args { @@ -39,6 +25,7 @@ enum Args { ParseFile(ParseFileArgs), GetUrlList(GetUrlListArgs), ScrapUrl(ScrapUrlArgs), + ScrapBestSelling, Auto(AutoArgs), Cron(AutoArgs), } @@ -71,6 +58,7 @@ async fn main() -> anyhow::Result<()> { Args::ParseFile(a) => parse_file_cli(a.file_path).await, Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await, Args::ScrapUrl(a) => scrap_url_cli(a.url).await, + Args::ScrapBestSelling => scrap_best_selling_cli().await, Args::Auto(_) => auto_cli().await, Args::Cron(_) => cron_cli().await, } @@ -83,6 +71,14 @@ async fn scrap_url_cli(url: String) -> anyhow::Result<()> { println!("Result: {:#?}", res); res.map(|_| ()) } +mod best_selling; +async fn scrap_best_selling_cli() -> anyhow::Result<()> { + let db = connect_db(); + let res = best_selling::get_all_best_selling(&db).await; + + println!("Result: {:#?}", res); + res.map(|_| ()) +} async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> { let links_str = fs::read_to_string(links_list_path).unwrap(); @@ -131,14 +127,6 @@ fn connect_db() -> Pool { cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap() } -fn build_client() -> reqwest::Client { - reqwest::ClientBuilder::default() - .timeout(Duration::from_secs(60 * 5)) - .connect_timeout(Duration::from_secs(60)) - .build() - .unwrap() -} - #[derive(Default, Debug)] struct Counters { success: u64, @@ -190,6 +178,16 @@ enum FetchError { Tl(#[from] tl::ParseError), } +fn build_client() -> reqwest::Client { + let mut headers = HeaderMap::new(); + headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap()); + reqwest::ClientBuilder::default() + .timeout(Duration::from_secs(60 * 5)) + .connect_timeout(Duration::from_secs(60)) + .default_headers(headers) + .build() + .unwrap() +} pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result { let request = client.get(url).build()?; let response = client.execute(request).await?.error_for_status()?; @@ -356,9 +354,26 @@ impl Auto { )) .await; } + + let best_selling = self + .inform_time( + "Downloaded best selling", + best_selling::get_all_best_selling(&self.pool), + ) + .await?; + self.save_best_selling(best_selling).await?; + Ok(()) } + async fn inform_time, R>(&self, msg: &str, action: T) -> R { + let t0 = now_sec(); + let res = action.await; + self.inform(&format!("{} (took {})", msg, now_sec() - t0)) + .await; + res + } + async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> { let urls = get_urls(supermercado).await?; self.pool @@ -385,6 +400,35 @@ impl Auto { Ok(()) } + async fn save_best_selling(&self, best_selling: Vec) -> anyhow::Result<()> { + self.pool + .get() + .await? + .interact(move |conn| -> Result<(), anyhow::Error> { + let tx = conn.transaction()?; + { + let mut stmt = tx.prepare( + r#"INSERT INTO db_best_selling(fetched_at, category, eans_json) + VALUES (?1, ?2, ?3);"#, + )?; + for record in best_selling { + let eans_json = serde_json::Value::from(record.eans).to_string(); + let fetched_at = record.fetched_at.timestamp_millis(); + stmt.execute(rusqlite::params![ + fetched_at, + record.category.id(), + eans_json + ])?; + } + } + tx.commit()?; + Ok(()) + }) + .await + .unwrap()?; + Ok(()) + } + async fn inform(&self, msg: &str) { println!("{}", msg); if let Some(telegram) = &self.telegram { diff --git a/scraper-rs/src/sites/mod.rs b/scraper-rs/src/sites/mod.rs index 019de83..e305f94 100644 --- a/scraper-rs/src/sites/mod.rs +++ b/scraper-rs/src/sites/mod.rs @@ -3,4 +3,4 @@ mod common; pub mod coto; pub mod dia; pub mod jumbo; -mod vtex; +pub mod vtex; diff --git a/scraper-rs/src/sites/vtex.rs b/scraper-rs/src/sites/vtex.rs index 7b09945..1fdb44e 100644 --- a/scraper-rs/src/sites/vtex.rs +++ b/scraper-rs/src/sites/vtex.rs @@ -1,7 +1,12 @@ +use std::str::FromStr; + use anyhow::{bail, Context}; +use base64::Engine; use futures::{stream, StreamExt, TryStreamExt}; use itertools::Itertools; +use reqwest::Url; use serde::Deserialize; +use serde_json::json; use simple_error::SimpleError; use tl::VDom; @@ -100,7 +105,7 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result { "instock" => true, _ => bail!("Not a valid product:availability"), }, - None => bail!("No product:availability in carrefour"), + None => bail!("No product:availability in vtex"), }, ) } @@ -146,6 +151,101 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result(client: &reqwest::Client, url: &str) -> anyhow::Result { + let body = get_retry_policy() + .retry_if(|| do_request(client, url), retry_if_wasnt_not_found) + .await? + .text() + .await?; + Ok(body) +} + +async fn get_binding_id(client: &reqwest::Client, url: &str) -> anyhow::Result { + let body = fetch_body(client, url).await?; + let dom = tl::parse(&body, tl::ParserOptions::default())?; + let json = parse_script_json(&dom, "__RUNTIME__")?; + let id = json + .as_object() + .ok_or(SimpleError::new("RUNTIME not an object"))? + .get("binding") + .and_then(|v| v.as_object()) + .and_then(|o| o.get("id")) + .and_then(|v| v.as_str()) + .ok_or(SimpleError::new("binding.id does not exist"))? + .to_string(); + Ok(id) +} + +/// Returns a vec of product URLs +/// +/// Estos parametros se consiguen yendo a una página como `https://www.jumbo.com.ar/almacen` y extrayendo: +/// * `domain` - www.jumbo.com.ar +/// * `query` - almacen +/// +/// También `https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras`: +/// * `domain` - diaonline.supermercadosdia.com.ar +/// * `query` - frescos/frutas-y-verduras +pub async fn get_best_selling_by_category( + client: &reqwest::Client, + domain: &str, + query: &str, +) -> anyhow::Result> { + let base_url = { Url::from_str(&format!("https://{}/{}", domain, query)).unwrap() }; + + let binding_id = get_binding_id(client, base_url.as_str()).await?; + let url = { + let mut url = base_url.clone(); + url.set_path("/_v/segment/graphql/v1"); + url.query_pairs_mut().append_pair("workspace", "master") + .append_pair("maxAge", "short") + .append_pair("appsEtag", "remove") + .append_pair("domain", "store") + .append_pair("locale", "es-AR") + .append_pair("__bindingId", &binding_id) + .append_pair("operationName", "productSearchV3") + .append_pair("variables", "%7B%7D") + .append_pair("extensions", &{ + let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets": + query.split('/').map(|f| json!({"key":"c","value":f})).collect::>() + ,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false}); + let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string()); + + format!( + r#"{{ + "persistedQuery": {{ + "version": 1, + "sha256Hash": "40b843ca1f7934d20d05d334916220a0c2cae3833d9f17bcb79cdd2185adceac", + "sender": "vtex.store-resources@0.x", + "provider": "vtex.search-graphql@0.x" + }}, + "variables": "{}" + }}"#, b64 + ) + }); + url + }; + let body = fetch_body(client, url.as_str()).await?; + let urls: Vec = serde_json::from_str::(&body)? + .pointer("/data/productSearch/products") + .and_then(|v| v.as_array()) + .map(|a| { + a.iter() + .filter_map(|p| { + p.get("link") + .and_then(|v| v.as_str()) + .map(|s| format!("https://{}{}", domain, s)) + }) + .collect() + }) + .ok_or(SimpleError::new("failed to get best selling product urls"))?; + + if urls.len() < 2 { + bail!("Too few best selling"); + } + + Ok(urls) +} + #[cfg(test)] mod tests { use super::*; @@ -163,4 +263,11 @@ mod tests { assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p"); Ok(()) } + + #[tokio::test] + async fn test_jumbo_best_selling() -> anyhow::Result<()> { + get_best_selling_by_category(&build_client(), "www.jumbo.com.ar", "almacen").await?; + // assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p"); + Ok(()) + } } diff --git a/scraper-rs/src/supermercado.rs b/scraper-rs/src/supermercado.rs new file mode 100644 index 0000000..ba7994f --- /dev/null +++ b/scraper-rs/src/supermercado.rs @@ -0,0 +1,19 @@ +use clap::ValueEnum; + +#[derive(ValueEnum, Clone, Debug)] +pub enum Supermercado { + Dia, + Jumbo, + Carrefour, + Coto, +} +impl Supermercado { + pub fn host(&self) -> &'static str { + match self { + Self::Dia => "diaonline.supermercadosdia.com.ar", + Self::Carrefour => "www.carrefour.com.ar", + Self::Coto => "www.cotodigital3.com.ar", + Self::Jumbo => "www.jumbo.com.ar", + } + } +} From cc0af3011a9a26874aa93a61dcffdce23483df74 Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 29 Jan 2024 12:25:24 -0300 Subject: [PATCH 2/8] solo hacer best selling una vez --- scraper-rs/src/main.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 47e2055..308f0eb 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -355,14 +355,6 @@ impl Auto { .await; } - let best_selling = self - .inform_time( - "Downloaded best selling", - best_selling::get_all_best_selling(&self.pool), - ) - .await?; - self.save_best_selling(best_selling).await?; - Ok(()) } @@ -466,6 +458,15 @@ async fn auto_cli() -> anyhow::Result<()> { .map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned()))) .collect(); future::try_join_all(handles).await?; + + let best_selling = auto + .inform_time( + "Downloaded best selling", + best_selling::get_all_best_selling(&auto.pool), + ) + .await?; + auto.save_best_selling(best_selling).await?; + Ok(()) } async fn cron_cli() -> anyhow::Result<()> { From 4669e6087fb7dbda5e02eedecf57861151b512b5 Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 29 Jan 2024 12:47:09 -0300 Subject: [PATCH 3/8] arreglar migracion cli --- db-datos/migrate-cli.js | 3 +++ db-datos/migrate.js | 7 ++++++- db-datos/package.json | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 db-datos/migrate-cli.js diff --git a/db-datos/migrate-cli.js b/db-datos/migrate-cli.js new file mode 100644 index 0000000..df20787 --- /dev/null +++ b/db-datos/migrate-cli.js @@ -0,0 +1,3 @@ +import { getDb } from "./db.js"; + +getDb(); diff --git a/db-datos/migrate.js b/db-datos/migrate.js index 0419aee..74ee10e 100644 --- a/db-datos/migrate.js +++ b/db-datos/migrate.js @@ -2,12 +2,17 @@ import { migrate } from "drizzle-orm/better-sqlite3/migrator"; import * as schema from "./schema.js"; import { sql } from "drizzle-orm"; +import { existsSync } from "node:fs"; +import { join } from "node:path"; /** * @param {import("drizzle-orm/better-sqlite3").BetterSQLite3Database} db */ export function migrateDb(db) { - migrate(db, { migrationsFolder: "node_modules/db-datos/drizzle" }); + let path = "drizzle/"; + if (!existsSync(join(path, "meta/_journal.json"))) + path = "node_modules/db-datos/drizzle"; + migrate(db, { migrationsFolder: path }); db.run(sql`pragma journal_mode = WAL;`); db.run(sql`PRAGMA synchronous = NORMAL;`); } diff --git a/db-datos/package.json b/db-datos/package.json index 262876d..d3a657f 100644 --- a/db-datos/package.json +++ b/db-datos/package.json @@ -6,7 +6,7 @@ "main": "index.js", "scripts": { "generate": "drizzle-kit generate:sqlite", - "migrate": "node db.js" + "migrate": "node migrate-cli.js" }, "keywords": [], "author": "", From ce0708738fc4685f0b485588db3f11a26aac1dfc Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 30 Jan 2024 10:52:31 -0300 Subject: [PATCH 4/8] sqlx --- scraper-rs/.env | 1 + ...535425e5c2f39cf98735b5f67cb91d01937ce.json | 12 + ...7f6c0a78e7e025cf152a8c176b9fd1de241da.json | 12 + ...df4e257f3c9b0efa62c8342d077d69d826a69.json | 20 + ...8c67b47ec8a575d2a14a487b3294e0faec438.json | 12 + ...9744a8d5f70c83ed9ddddfd55009136088a52.json | 20 + scraper-rs/Cargo.lock | 755 +++++++++++++++++- scraper-rs/Cargo.toml | 3 +- scraper-rs/src/best_selling.rs | 27 +- scraper-rs/src/db.rs | 109 +++ scraper-rs/src/main.rs | 154 +--- scraper-rs/src/sites/common.rs | 4 +- scraper-rs/src/sites/coto.rs | 2 +- 13 files changed, 948 insertions(+), 183 deletions(-) create mode 100644 scraper-rs/.env create mode 100644 scraper-rs/.sqlx/query-08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce.json create mode 100644 scraper-rs/.sqlx/query-144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da.json create mode 100644 scraper-rs/.sqlx/query-aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69.json create mode 100644 scraper-rs/.sqlx/query-d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438.json create mode 100644 scraper-rs/.sqlx/query-f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52.json create mode 100644 scraper-rs/src/db.rs diff --git a/scraper-rs/.env b/scraper-rs/.env new file mode 100644 index 0000000..83d2c4d --- /dev/null +++ b/scraper-rs/.env @@ -0,0 +1 @@ +DATABASE_URL=sqlite:../sqlite.db diff --git a/scraper-rs/.sqlx/query-08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce.json b/scraper-rs/.sqlx/query-08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce.json new file mode 100644 index 0000000..9f6eb81 --- /dev/null +++ b/scraper-rs/.sqlx/query-08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce.json @@ -0,0 +1,12 @@ +{ + "db_name": "SQLite", + "query": "INSERT INTO producto_urls(url, first_seen, last_seen)\n VALUES (?1, ?2, ?2)\n ON CONFLICT(url) DO UPDATE SET last_seen=?2;", + "describe": { + "columns": [], + "parameters": { + "Right": 2 + }, + "nullable": [] + }, + "hash": "08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce" +} diff --git a/scraper-rs/.sqlx/query-144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da.json b/scraper-rs/.sqlx/query-144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da.json new file mode 100644 index 0000000..8bf6e78 --- /dev/null +++ b/scraper-rs/.sqlx/query-144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da.json @@ -0,0 +1,12 @@ +{ + "db_name": "SQLite", + "query": "INSERT INTO db_best_selling(fetched_at, category, eans_json)\n VALUES (?1, ?2, ?3);", + "describe": { + "columns": [], + "parameters": { + "Right": 3 + }, + "nullable": [] + }, + "hash": "144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da" +} diff --git a/scraper-rs/.sqlx/query-aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69.json b/scraper-rs/.sqlx/query-aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69.json new file mode 100644 index 0000000..08c07a1 --- /dev/null +++ b/scraper-rs/.sqlx/query-aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69.json @@ -0,0 +1,20 @@ +{ + "db_name": "SQLite", + "query": "SELECT url FROM producto_urls WHERE url LIKE ?1;", + "describe": { + "columns": [ + { + "name": "url", + "ordinal": 0, + "type_info": "Text" + } + ], + "parameters": { + "Right": 1 + }, + "nullable": [ + false + ] + }, + "hash": "aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69" +} diff --git a/scraper-rs/.sqlx/query-d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438.json b/scraper-rs/.sqlx/query-d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438.json new file mode 100644 index 0000000..b870e21 --- /dev/null +++ b/scraper-rs/.sqlx/query-d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438.json @@ -0,0 +1,12 @@ +{ + "db_name": "SQLite", + "query": "INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);", + "describe": { + "columns": [], + "parameters": { + "Right": 9 + }, + "nullable": [] + }, + "hash": "d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438" +} diff --git a/scraper-rs/.sqlx/query-f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52.json b/scraper-rs/.sqlx/query-f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52.json new file mode 100644 index 0000000..eeb8a74 --- /dev/null +++ b/scraper-rs/.sqlx/query-f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52.json @@ -0,0 +1,20 @@ +{ + "db_name": "SQLite", + "query": "SELECT ean FROM precios WHERE url = ?1;", + "describe": { + "columns": [ + { + "name": "ean", + "ordinal": 0, + "type_info": "Text" + } + ], + "parameters": { + "Right": 1 + }, + "nullable": [ + false + ] + }, + "hash": "f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52" +} diff --git a/scraper-rs/Cargo.lock b/scraper-rs/Cargo.lock index 30ea007..fc60e87 100644 --- a/scraper-rs/Cargo.lock +++ b/scraper-rs/Cargo.lock @@ -35,6 +35,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", + "getrandom 0.2.11", "once_cell", "version_check", "zerocopy", @@ -145,14 +146,22 @@ dependencies = [ ] [[package]] -name = "async-trait" -version = "0.1.77" +name = "atoi" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" dependencies = [ - "proc-macro2", - "quote", - "syn", + "num-traits", +] + +[[package]] +name = "atomic-write-file" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edcdbedc2236483ab103a53415653d6b4442ea6141baf1ffa85df29635e88436" +dependencies = [ + "nix", + "rand 0.8.5", ] [[package]] @@ -182,6 +191,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bitflags" version = "1.3.2" @@ -193,6 +208,18 @@ name = "bitflags" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +dependencies = [ + "serde", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] [[package]] name = "brotli" @@ -221,6 +248,12 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.5.0" @@ -287,7 +320,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -302,6 +335,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "core-foundation" version = "0.9.4" @@ -318,6 +357,30 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.3.2" @@ -339,51 +402,67 @@ dependencies = [ ] [[package]] -name = "deadpool" -version = "0.10.0" +name = "crossbeam-queue" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" dependencies = [ - "async-trait", - "deadpool-runtime", - "num_cpus", - "tokio", + "crossbeam-utils", ] [[package]] -name = "deadpool-runtime" -version = "0.1.3" +name = "crossbeam-utils" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63dfa964fe2a66f3fde91fc70b267fe193d822c7e603e2a675a49a7f46ad3f49" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ - "tokio", + "generic-array", + "typenum", ] [[package]] -name = "deadpool-sqlite" -version = "0.7.0" +name = "der" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8010e36e12f3be22543a5e478b4af20aeead9a700dd69581a5e050a070fc22c" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ - "deadpool", - "deadpool-sync", - "rusqlite", + "const-oid", + "pem-rfc7468", + "zeroize", ] [[package]] -name = "deadpool-sync" -version = "0.1.2" +name = "digest" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8db70494c13cae4ce67b4b4dafdaf828cf0df7237ab5b9e2fcabee4965d0a0a" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "deadpool-runtime", + "block-buffer", + "const-oid", + "crypto-common", + "subtle", ] +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + [[package]] name = "either" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +dependencies = [ + "serde", +] [[package]] name = "encoding_rs" @@ -400,6 +479,33 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "fallible-iterator" version = "0.3.0" @@ -412,6 +518,18 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + [[package]] name = "flate2" version = "1.0.28" @@ -422,6 +540,17 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -479,6 +608,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot 0.12.1", +] + [[package]] name = "futures-io" version = "0.3.30" @@ -493,7 +633,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -526,6 +666,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.1.16" @@ -597,6 +747,9 @@ name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] [[package]] name = "hermit-abi" @@ -604,6 +757,39 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "html-escape" version = "0.2.13" @@ -772,6 +958,9 @@ name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "libc" @@ -779,16 +968,29 @@ version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "libsqlite3-sys" version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" dependencies = [ + "cc", "pkg-config", "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.11" @@ -805,6 +1007,16 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.1" @@ -852,6 +1064,17 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.1", + "cfg-if", + "libc", +] + [[package]] name = "nom" version = "7.1.3" @@ -872,6 +1095,44 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.17" @@ -879,6 +1140,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -960,6 +1222,21 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -978,6 +1255,27 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.28" @@ -1158,11 +1456,31 @@ dependencies = [ "cc", "getrandom 0.2.11", "libc", - "spin", + "spin 0.9.8", "untrusted", "windows-sys 0.48.0", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core 0.6.4", + "signature", + "spki", + "subtle", + "zeroize", +] + [[package]] name = "rusqlite" version = "0.30.0" @@ -1183,6 +1501,19 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.10" @@ -1236,8 +1567,6 @@ dependencies = [ "chrono", "clap", "cron", - "deadpool", - "deadpool-sqlite", "futures", "html-escape", "itertools", @@ -1249,6 +1578,7 @@ dependencies = [ "serde", "serde_json", "simple-error", + "sqlx", "thiserror", "tl", "tokio", @@ -1283,7 +1613,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1309,6 +1639,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1327,6 +1679,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "simple-error" version = "0.3.0" @@ -1358,11 +1720,248 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +dependencies = [ + "itertools", + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "dotenvy", + "either", + "event-listener", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap", + "log", + "memchr", + "once_cell", + "paste", + "percent-encoding", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0bd4519486723648186a08785143599760f7cc81c52334a55d6a83ea1e20841" +dependencies = [ + "atomic-write-file", + "dotenvy", + "either", + "heck", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand 0.8.5", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand 0.8.5", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", +] + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] [[package]] name = "strsim" @@ -1370,6 +1969,23 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.48" @@ -1402,6 +2018,19 @@ dependencies = [ "libc", ] +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "thiserror" version = "1.0.56" @@ -1419,7 +2048,7 @@ checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1479,7 +2108,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1504,6 +2133,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.10" @@ -1530,6 +2170,7 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -1543,7 +2184,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1587,6 +2228,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-bidi" version = "0.3.14" @@ -1608,6 +2255,18 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" @@ -1625,6 +2284,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8-width" version = "0.1.7" @@ -1697,7 +2362,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.48", "wasm-bindgen-shared", ] @@ -1731,7 +2396,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1773,6 +2438,12 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" + [[package]] name = "winapi" version = "0.3.9" @@ -1963,5 +2634,11 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/scraper-rs/Cargo.toml b/scraper-rs/Cargo.toml index e8b1a0a..809f941 100644 --- a/scraper-rs/Cargo.toml +++ b/scraper-rs/Cargo.toml @@ -12,8 +12,7 @@ base64 = "0.21.7" chrono = "0.4.32" clap = { version = "4.4.15", features = ["derive"] } cron = "0.12.0" -deadpool = "0.10.0" -deadpool-sqlite = "0.7.0" +sqlx = { version = "0.7", features = [ "runtime-tokio", "sqlite" ] } futures = "0.3.30" html-escape = "0.2.13" itertools = "0.12.0" diff --git a/scraper-rs/src/best_selling.rs b/scraper-rs/src/best_selling.rs index 2f3e586..23f5cd9 100644 --- a/scraper-rs/src/best_selling.rs +++ b/scraper-rs/src/best_selling.rs @@ -1,9 +1,8 @@ use std::collections::HashMap; -use crate::{build_client, sites::vtex, supermercado::Supermercado}; +use crate::{build_client, db::Db, sites::vtex, supermercado::Supermercado}; use chrono::{DateTime, Utc}; use clap::ValueEnum; -use deadpool_sqlite::Pool; use futures::{stream, FutureExt, StreamExt, TryStreamExt}; use itertools::Itertools; use tracing::warn; @@ -49,21 +48,11 @@ pub struct BestSellingRecord { pub eans: Vec, } -async fn get_best_selling_eans(pool: &Pool, urls: Vec) -> anyhow::Result> { +async fn get_best_selling_eans(db: &Db, urls: Vec) -> anyhow::Result> { let mut eans: Vec = Vec::new(); for url in urls { - let q = url.clone(); - let ean = pool - .get() - .await? - .interact(move |conn| { - conn.prepare(r#"SELECT ean FROM precios WHERE url = ?1;"#)? - .query_map(rusqlite::params![q], |r| r.get::<_, String>(0)) - .map(|r| r.map(|r| r.unwrap()).next()) - }) - .await - .unwrap()?; + let ean = db.get_ean_by_url(&url).await?; match ean { Some(e) => eans.push(e), None => warn!("No encontré EAN para {}", url), @@ -75,13 +64,13 @@ async fn get_best_selling_eans(pool: &Pool, urls: Vec) -> anyhow::Result async fn try_get_best_selling_eans( client: reqwest::Client, - pool: Pool, + db: Db, supermercado: &Supermercado, category: &Category, ) -> anyhow::Result>> { if let Some(query) = category.query(supermercado) { let urls = vtex::get_best_selling_by_category(&client, supermercado.host(), query).await?; - let eans = get_best_selling_eans(&pool, urls).await?; + let eans = get_best_selling_eans(&db, urls).await?; Ok(Some(eans)) } else { Ok(None) @@ -107,18 +96,18 @@ fn rank_eans(eans: Vec>) -> Vec { .collect_vec() } -pub async fn get_all_best_selling(pool: &Pool) -> anyhow::Result> { +pub async fn get_all_best_selling(db: &Db) -> anyhow::Result> { let client = &build_client(); stream::iter(Category::value_variants()) .map(|category| { stream::iter(Supermercado::value_variants()) .map(|supermercado| { - let pool = pool.clone(); + let db = db.clone(); let client = client.clone(); tokio::spawn(try_get_best_selling_eans( client, - pool, + db, supermercado, category, )) diff --git a/scraper-rs/src/db.rs b/scraper-rs/src/db.rs new file mode 100644 index 0000000..2385f04 --- /dev/null +++ b/scraper-rs/src/db.rs @@ -0,0 +1,109 @@ +use std::{ + env, + str::FromStr, + time::{SystemTime, UNIX_EPOCH}, +}; + +use sqlx::{sqlite::SqliteConnectOptions, SqlitePool}; + +use crate::{best_selling::BestSellingRecord, PrecioPoint}; + +#[derive(Clone)] +pub struct Db { + pool: SqlitePool, +} + +impl Db { + pub async fn connect() -> anyhow::Result { + let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()); + let pool = sqlx::pool::PoolOptions::new() + .max_connections(1) + .connect_with( + SqliteConnectOptions::from_str(&db_path)? + .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal) + .synchronous(sqlx::sqlite::SqliteSynchronous::Normal) + .optimize_on_close(true, None), + ) + .await?; + Ok(Self { pool }) + } + + pub async fn insert_precio(&self, point: PrecioPoint) -> anyhow::Result<()> { + sqlx::query!("INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);", + point.ean, + point.fetched_at, + point.precio_centavos, + point.in_stock, + point.url, + None::, + point.parser_version, + point.name, + point.image_url, + ).execute(&self.pool).await?; + Ok(()) + } + + pub async fn get_ean_by_url(&self, url: &str) -> anyhow::Result> { + Ok(sqlx::query!("SELECT ean FROM precios WHERE url = ?1;", url) + .fetch_optional(&self.pool) + .await? + .map(|r| r.ean)) + } + + pub async fn get_urls_by_domain(&self, domain: &str) -> anyhow::Result> { + let query = format!("%{}%", domain); + Ok( + sqlx::query!("SELECT url FROM producto_urls WHERE url LIKE ?1;", query) + .fetch_all(&self.pool) + .await? + .into_iter() + .map(|r| r.url) + .collect(), + ) + } + + pub async fn save_producto_urls(&self, urls: Vec) -> anyhow::Result<()> { + let now: i64 = now_ms().try_into()?; + let mut tx = self.pool.begin().await?; + for url in urls { + sqlx::query!( + r#"INSERT INTO producto_urls(url, first_seen, last_seen) + VALUES (?1, ?2, ?2) + ON CONFLICT(url) DO UPDATE SET last_seen=?2;"#, + url, + now + ) + .execute(&mut *tx) + .await?; + } + tx.commit().await?; + Ok(()) + } + + pub async fn save_best_selling(&self, records: Vec) -> anyhow::Result<()> { + let mut tx = self.pool.begin().await?; + for record in records { + let fetched_at = record.fetched_at.timestamp_millis(); + let category = record.category.id(); + let eans_json = serde_json::Value::from(record.eans).to_string(); + sqlx::query!( + r#"INSERT INTO db_best_selling(fetched_at, category, eans_json) + VALUES (?1, ?2, ?3);"#, + fetched_at, + category, + eans_json + ) + .execute(&mut *tx) + .await?; + } + tx.commit().await?; + Ok(()) + } +} + +fn now_ms() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_millis() +} diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 308f0eb..3968b12 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -1,8 +1,7 @@ use again::RetryPolicy; -use best_selling::BestSellingRecord; use clap::{Parser, ValueEnum}; use cron::Schedule; -use deadpool_sqlite::Pool; +use db::Db; use futures::{future, stream, Future, StreamExt}; use nanoid::nanoid; use reqwest::{header::HeaderMap, StatusCode, Url}; @@ -73,7 +72,7 @@ async fn scrap_url_cli(url: String) -> anyhow::Result<()> { } mod best_selling; async fn scrap_best_selling_cli() -> anyhow::Result<()> { - let db = connect_db(); + let db = Db::connect().await?; let res = best_selling::get_all_best_selling(&db).await; println!("Result: {:#?}", res); @@ -89,14 +88,14 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> { .map(|s| s.to_owned()) .collect::>(); - let pool = connect_db(); - let counters = fetch_list(&pool, links).await; + let db = Db::connect().await?; + let counters = fetch_list(&db, links).await; println!("Finished: {:?}", counters); Ok(()) } -async fn fetch_list(pool: &Pool, links: Vec) -> Counters { +async fn fetch_list(db: &Db, links: Vec) -> Counters { let n_coroutines = env::var("N_COROUTINES") .map_or(Ok(24), |s| s.parse::()) .expect("N_COROUTINES no es un número"); @@ -105,9 +104,9 @@ async fn fetch_list(pool: &Pool, links: Vec) -> Counters { stream::iter(links) .map(|url| { - let pool = pool.clone(); + let db = db.clone(); let client = client.clone(); - tokio::spawn(fetch_and_save(client, url, pool)) + tokio::spawn(fetch_and_save(client, url, db)) }) .buffer_unordered(n_coroutines) .fold(Counters::default(), move |x, y| { @@ -121,11 +120,7 @@ async fn fetch_list(pool: &Pool, links: Vec) -> Counters { .await } -fn connect_db() -> Pool { - let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()); - let cfg = deadpool_sqlite::Config::new(db_path); - cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap() -} +mod db; #[derive(Default, Debug)] struct Counters { @@ -134,26 +129,13 @@ struct Counters { skipped: u64, } -async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Counters { +async fn fetch_and_save(client: reqwest::Client, url: String, db: Db) -> Counters { let res = fetch_and_parse(&client, url.clone()).await; let mut counters = Counters::default(); match res { Ok(res) => { counters.success += 1; - pool.get().await.unwrap().interact(move |conn| conn.execute( - "INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);", - rusqlite::params![ - res.ean, - res.fetched_at, - res.precio_centavos, - res.in_stock, - res.url, - None::, - res.parser_version, - res.name, - res.image_url, - ] - )).await.unwrap().unwrap(); + db.insert_precio(res).await.unwrap(); } Err(err) => { match err.downcast_ref::() { @@ -301,7 +283,7 @@ struct AutoTelegram { #[derive(Clone)] struct Auto { - pool: Pool, + db: Db, telegram: Option, } impl Auto { @@ -316,24 +298,7 @@ impl Auto { )) .await; } - let links: Vec = { - let search = format!("%{}%", supermercado.host()); - self.pool - .get() - .await? - .interact(move |conn| -> anyhow::Result> { - Ok(conn - .prepare( - r#"SELECT url FROM producto_urls - WHERE url LIKE ?1;"#, - )? - .query_map(rusqlite::params![search], |r| r.get::<_, String>(0))? - .map(|r| r.unwrap()) - .collect()) - }) - .await - .unwrap()? - }; + let links: Vec = self.db.get_urls_by_domain(supermercado.host()).await?; // { // let debug_path = PathBuf::from("debug/"); // tokio::fs::create_dir_all(&debug_path).await.unwrap(); @@ -345,7 +310,7 @@ impl Auto { // } { let t0 = now_sec(); - let counters = fetch_list(&self.pool, links).await; + let counters = fetch_list(&self.db, links).await; self.inform(&format!( "Downloaded {:?}: {:?} (took {})", &supermercado, @@ -368,56 +333,7 @@ impl Auto { async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> { let urls = get_urls(supermercado).await?; - self.pool - .get() - .await? - .interact(|conn| -> Result<(), anyhow::Error> { - let tx = conn.transaction()?; - { - let mut stmt = tx.prepare( - r#"INSERT INTO producto_urls(url, first_seen, last_seen) - VALUES (?1, ?2, ?2) - ON CONFLICT(url) DO UPDATE SET last_seen=?2;"#, - )?; - let now: u64 = now_ms().try_into()?; - for url in urls { - stmt.execute(rusqlite::params![url, now])?; - } - } - tx.commit()?; - Ok(()) - }) - .await - .unwrap()?; - Ok(()) - } - - async fn save_best_selling(&self, best_selling: Vec) -> anyhow::Result<()> { - self.pool - .get() - .await? - .interact(move |conn| -> Result<(), anyhow::Error> { - let tx = conn.transaction()?; - { - let mut stmt = tx.prepare( - r#"INSERT INTO db_best_selling(fetched_at, category, eans_json) - VALUES (?1, ?2, ?3);"#, - )?; - for record in best_selling { - let eans_json = serde_json::Value::from(record.eans).to_string(); - let fetched_at = record.fetched_at.timestamp_millis(); - stmt.execute(rusqlite::params![ - fetched_at, - record.category.id(), - eans_json - ])?; - } - } - tx.commit()?; - Ok(()) - }) - .await - .unwrap()?; + self.db.save_producto_urls(urls).await?; Ok(()) } @@ -438,20 +354,22 @@ impl Auto { } async fn auto_cli() -> anyhow::Result<()> { - let db = connect_db(); - let telegram = { - match ( - env::var("TELEGRAM_BOT_TOKEN"), - env::var("TELEGRAM_BOT_CHAT_ID"), - ) { - (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }), - _ => { - tracing::warn!("No token or chat_id for telegram"); - None + let auto = { + let db = Db::connect().await?; + let telegram = { + match ( + env::var("TELEGRAM_BOT_TOKEN"), + env::var("TELEGRAM_BOT_CHAT_ID"), + ) { + (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }), + _ => { + tracing::warn!("No token or chat_id for telegram"); + None + } } - } + }; + Auto { db, telegram } }; - let auto = Auto { pool: db, telegram }; auto.inform("[auto] Empezando scrap").await; let handles: Vec<_> = Supermercado::value_variants() .iter() @@ -462,10 +380,10 @@ async fn auto_cli() -> anyhow::Result<()> { let best_selling = auto .inform_time( "Downloaded best selling", - best_selling::get_all_best_selling(&auto.pool), + best_selling::get_all_best_selling(&auto.db), ) .await?; - auto.save_best_selling(best_selling).await?; + auto.db.save_best_selling(best_selling).await?; Ok(()) } @@ -494,8 +412,8 @@ mod sites; struct PrecioPoint { ean: String, // unix - fetched_at: u64, - precio_centavos: Option, + fetched_at: i64, + precio_centavos: Option, in_stock: Option, url: String, parser_version: u16, @@ -503,13 +421,9 @@ struct PrecioPoint { image_url: Option, } -fn now_sec() -> u64 { - since_the_epoch().as_secs() +fn now_sec() -> i64 { + since_the_epoch().as_secs().try_into().unwrap() } -fn now_ms() -> u128 { - since_the_epoch().as_millis() -} - fn since_the_epoch() -> Duration { SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/scraper-rs/src/sites/common.rs b/scraper-rs/src/sites/common.rs index d75b03e..badafb3 100644 --- a/scraper-rs/src/sites/common.rs +++ b/scraper-rs/src/sites/common.rs @@ -11,9 +11,9 @@ pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option) -> Result, anyhow::Error> { +pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result, anyhow::Error> { let precio_centavos = get_meta_content(dom, "product:price:amount") - .map(|s| s.parse::().map(|f| (f * 100.0) as u64)) + .map(|s| s.parse::().map(|f| (f * 100.0) as i64)) .transpose()?; Ok(precio_centavos) } diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs index 2e89664..3f1e4d8 100644 --- a/scraper-rs/src/sites/coto.rs +++ b/scraper-rs/src/sites/coto.rs @@ -37,7 +37,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result }) .transpose() .context("Parseando precio")? - .map(|f| (f * 100.0) as u64); + .map(|f| (f * 100.0) as i64); let in_stock = Some( dom.query_selector(".product_not_available") From 33d416d921f66442131aee68cbfd1a49761ff426 Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 30 Jan 2024 11:06:35 -0300 Subject: [PATCH 5/8] dockerfile.scraper: agregar DB con migraciones --- Dockerfile.scraper | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Dockerfile.scraper b/Dockerfile.scraper index 8494c5f..6a06df3 100644 --- a/Dockerfile.scraper +++ b/Dockerfile.scraper @@ -2,14 +2,23 @@ FROM cgr.dev/chainguard/wolfi-base AS base WORKDIR /usr/src/app RUN apk add --no-cache libgcc +# tenemos que generar una DB con las migraciones aplicadas para compilar el codigo por sqlx::query!() +FROM base as db-build +RUN apk add --no-cache nodejs npm +RUN npm install --global pnpm +COPY db-datos/ . +RUN pnpm install +RUN DB_PATH=db.db pnpm migrate + FROM base as rs-build RUN apk add --no-cache rust build-base sqlite-dev COPY scraper-rs/ . +COPY --from=db-build /usr/src/app/db.db . RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \ --mount=type=cache,sharing=locked,target=/root/.cargo/registry \ --mount=type=cache,sharing=locked,target=/usr/src/app/target \ - cargo install --locked --path . + DATABASE_URL=sqlite:db.db cargo install --locked --path . FROM base RUN apk add --no-cache sqlite sqlite-libs From 9dd3a8766f2c07d0036767065cc28e79db9485f6 Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 30 Jan 2024 11:48:09 -0300 Subject: [PATCH 6/8] db: arreglar init --- scraper-rs/src/db.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scraper-rs/src/db.rs b/scraper-rs/src/db.rs index 2385f04..78f9d4b 100644 --- a/scraper-rs/src/db.rs +++ b/scraper-rs/src/db.rs @@ -5,6 +5,7 @@ use std::{ }; use sqlx::{sqlite::SqliteConnectOptions, SqlitePool}; +use tracing::info; use crate::{best_selling::BestSellingRecord, PrecioPoint}; @@ -16,10 +17,11 @@ pub struct Db { impl Db { pub async fn connect() -> anyhow::Result { let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()); + info!("Opening DB at {}", db_path); let pool = sqlx::pool::PoolOptions::new() .max_connections(1) .connect_with( - SqliteConnectOptions::from_str(&db_path)? + SqliteConnectOptions::from_str(&format!("sqlite://{}", db_path))? .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal) .synchronous(sqlx::sqlite::SqliteSynchronous::Normal) .optimize_on_close(true, None), From 18d916bfcdfc59e747a53392b7f4917e90334148 Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 30 Jan 2024 22:53:35 -0300 Subject: [PATCH 7/8] crear index --- db-datos/drizzle/0011_huge_next_avengers.sql | 1 + db-datos/drizzle/meta/0011_snapshot.json | 190 +++++++++++++++++++ db-datos/drizzle/meta/_journal.json | 7 + db-datos/schema.js | 1 + 4 files changed, 199 insertions(+) create mode 100644 db-datos/drizzle/0011_huge_next_avengers.sql create mode 100644 db-datos/drizzle/meta/0011_snapshot.json diff --git a/db-datos/drizzle/0011_huge_next_avengers.sql b/db-datos/drizzle/0011_huge_next_avengers.sql new file mode 100644 index 0000000..316c94f --- /dev/null +++ b/db-datos/drizzle/0011_huge_next_avengers.sql @@ -0,0 +1 @@ +CREATE INDEX `precios_url_idx` ON `precios` (`url`); \ No newline at end of file diff --git a/db-datos/drizzle/meta/0011_snapshot.json b/db-datos/drizzle/meta/0011_snapshot.json new file mode 100644 index 0000000..9fc2714 --- /dev/null +++ b/db-datos/drizzle/meta/0011_snapshot.json @@ -0,0 +1,190 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84", + "prevId": "c8297337-4ed8-432e-8782-65d41be42e00", + "tables": { + "db_best_selling": { + "name": "db_best_selling", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "category": { + "name": "category", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "eans_json": { + "name": "eans_json", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "precios": { + "name": "precios", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "ean": { + "name": "ean", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "precio_centavos": { + "name": "precio_centavos", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "in_stock": { + "name": "in_stock", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "warc_record_id": { + "name": "warc_record_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "parser_version": { + "name": "parser_version", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image_url": { + "name": "image_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "precios_ean_idx": { + "name": "precios_ean_idx", + "columns": [ + "ean" + ], + "isUnique": false + }, + "precios_url_idx": { + "name": "precios_url_idx", + "columns": [ + "url" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "producto_urls": { + "name": "producto_urls", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "first_seen": { + "name": "first_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "last_seen": { + "name": "last_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "producto_urls_url_unique": { + "name": "producto_urls_url_unique", + "columns": [ + "url" + ], + "isUnique": true + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +} \ No newline at end of file diff --git a/db-datos/drizzle/meta/_journal.json b/db-datos/drizzle/meta/_journal.json index c9d8729..89f60a6 100644 --- a/db-datos/drizzle/meta/_journal.json +++ b/db-datos/drizzle/meta/_journal.json @@ -78,6 +78,13 @@ "when": 1706540751931, "tag": "0010_true_black_tom", "breakpoints": true + }, + { + "idx": 11, + "version": "5", + "when": 1706628184254, + "tag": "0011_huge_next_avengers", + "breakpoints": true } ] } \ No newline at end of file diff --git a/db-datos/schema.js b/db-datos/schema.js index eabdc35..f77d2b5 100644 --- a/db-datos/schema.js +++ b/db-datos/schema.js @@ -18,6 +18,7 @@ export const precios = sqliteTable( (precios) => { return { preciosEanIdx: index("precios_ean_idx").on(precios.ean), + preciosUrlIdx: index("precios_url_idx").on(precios.url), }; } ); From a4869d068e76aaf699f2c556aa1f0bd87c25d5fc Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 5 Feb 2024 22:36:00 -0300 Subject: [PATCH 8/8] WIP --- Dockerfile | 2 +- db-datos/supermercado.ts | 3 ++ sitio/src/routes/+page.server.ts | 54 +++++++++++++++++++++++++++----- sitio/src/routes/+page.svelte | 9 +++++- 4 files changed, 58 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 73b4dcb..ee1deb9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,7 @@ COPY --from=build /usr/src/app/sitio/package.json package.real.json RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install COPY --from=build /usr/src/app/db-datos node_modules/db-datos COPY --from=build /usr/src/app/sitio/build . -COPY --from=build /usr/src/app/db-datos/drizzle . +COPY --from=build /usr/src/app/db-datos/drizzle drizzle ENV DB_PATH=/db/db.db EXPOSE 3000 diff --git a/db-datos/supermercado.ts b/db-datos/supermercado.ts index 1e167a0..ec4c8c3 100644 --- a/db-datos/supermercado.ts +++ b/db-datos/supermercado.ts @@ -16,6 +16,9 @@ export const hosts: { [host: string]: Supermercado } = { "www.cotodigital3.com.ar": Supermercado.Coto, "www.jumbo.com.ar": Supermercado.Jumbo, }; +export const hostBySupermercado = Object.fromEntries( + Object.entries(hosts).map(([a, b]) => [b, a]) +) as Record; export const colorBySupermercado: { [supermercado in Supermercado]: string } = { [Supermercado.Dia]: "#d52b1e", [Supermercado.Carrefour]: "#19549d", diff --git a/sitio/src/routes/+page.server.ts b/sitio/src/routes/+page.server.ts index dbb1534..ad906c7 100644 --- a/sitio/src/routes/+page.server.ts +++ b/sitio/src/routes/+page.server.ts @@ -1,25 +1,63 @@ import type { PageData, PageServerLoad } from "./$types"; import { getDb, schema } from "$lib/server/db"; const { precios } = schema; -import { sql } from "drizzle-orm"; +import { desc, sql } from "drizzle-orm"; +import { + Supermercado, + hostBySupermercado, + supermercados, +} from "db-datos/supermercado"; let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery(); + async function doQuery() { const db = await getDb(); - const q = db + console.time("ean"); + const eans = await db .select({ ean: precios.ean, - name: precios.name, - imageUrl: precios.imageUrl, }) .from(precios) .groupBy(precios.ean) - .having(sql`max(length(name)) and max(parser_version) and in_stock`) .orderBy(sql`random()`) - .limit(150); - const res = await q; - const data = { precios: res }; + .limit(50); + console.timeEnd("ean"); + + return; + + const precioss = await Promise.all( + supermercados.map( + async ( + supermercado, + ): Promise< + [ + Supermercado, + { ean: string; name: string | null; imageUrl: string | null }[], + ] + > => { + const host = hostBySupermercado[supermercado]; + console.time(supermercado); + const q = db + .select({ + ean: precios.ean, + name: precios.name, + imageUrl: precios.imageUrl, + }) + .from(precios) + .groupBy(precios.ean) + .having(sql`max(fetched_at)`) + .where( + sql`ean in ${eans.map((x) => x.ean)} and in_stock and url like ${`%${host}%`}`, + ); + // console.debug(q.toSQL()); + const res = await q; + console.timeEnd(supermercado); + return [supermercado, res]; + }, + ), + ); + const data = { precios: precioss.flatMap(([_, r]) => r) }; return { key: new Date(), data }; } diff --git a/sitio/src/routes/+page.svelte b/sitio/src/routes/+page.svelte index d41f231..6ce7518 100644 --- a/sitio/src/routes/+page.svelte +++ b/sitio/src/routes/+page.svelte @@ -7,6 +7,13 @@ (d): d is { ean: string; name: string; imageUrl: string | null } => !!d.name, ); + $: productos = precios.reduce( + (prev, curr) => [ + ...prev, + ...(prev.find((p) => p.ean === curr.ean) ? [] : [curr]), + ], + [] as { ean: string; name: string; imageUrl: string | null }[], + );

WIP

@@ -39,7 +46,7 @@

Random

    - {#each precios as product} + {#each productos as product}