This commit is contained in:
Nulo 2024-02-06 16:26:43 +00:00 committed by GitHub
commit ad306364b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 1731 additions and 178 deletions

2
.gitignore vendored
View file

@ -16,4 +16,4 @@ target/
*/flamegraph.svg
*/perf.data*
scraper-rs/debug/
scraper-rs/debug*/

View file

@ -23,7 +23,7 @@ COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .
COPY --from=build /usr/src/app/db-datos/drizzle .
COPY --from=build /usr/src/app/db-datos/drizzle drizzle
ENV DB_PATH=/db/db.db
EXPOSE 3000

View file

@ -2,14 +2,23 @@ FROM cgr.dev/chainguard/wolfi-base AS base
WORKDIR /usr/src/app
RUN apk add --no-cache libgcc
# tenemos que generar una DB con las migraciones aplicadas para compilar el codigo por sqlx::query!()
FROM base as db-build
RUN apk add --no-cache nodejs npm
RUN npm install --global pnpm
COPY db-datos/ .
RUN pnpm install
RUN DB_PATH=db.db pnpm migrate
FROM base as rs-build
RUN apk add --no-cache rust build-base sqlite-dev
COPY scraper-rs/ .
COPY --from=db-build /usr/src/app/db.db .
RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
cargo install --locked --path .
DATABASE_URL=sqlite:db.db cargo install --locked --path .
FROM base
RUN apk add --no-cache sqlite sqlite-libs

View file

@ -0,0 +1,6 @@
CREATE TABLE `db_best_selling` (
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
`fetched_at` integer NOT NULL,
`category` text NOT NULL,
`eans_json` text NOT NULL
);

View file

@ -0,0 +1 @@
CREATE INDEX `precios_url_idx` ON `precios` (`url`);

View file

@ -0,0 +1,183 @@
{
"version": "5",
"dialect": "sqlite",
"id": "c8297337-4ed8-432e-8782-65d41be42e00",
"prevId": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"tables": {
"db_best_selling": {
"name": "db_best_selling",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"category": {
"name": "category",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"eans_json": {
"name": "eans_json",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -0,0 +1,190 @@
{
"version": "5",
"dialect": "sqlite",
"id": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84",
"prevId": "c8297337-4ed8-432e-8782-65d41be42e00",
"tables": {
"db_best_selling": {
"name": "db_best_selling",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"category": {
"name": "category",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"eans_json": {
"name": "eans_json",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
},
"precios_url_idx": {
"name": "precios_url_idx",
"columns": [
"url"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -71,6 +71,20 @@
"when": 1703895109501,
"tag": "0009_breezy_forge",
"breakpoints": true
},
{
"idx": 10,
"version": "5",
"when": 1706540751931,
"tag": "0010_true_black_tom",
"breakpoints": true
},
{
"idx": 11,
"version": "5",
"when": 1706628184254,
"tag": "0011_huge_next_avengers",
"breakpoints": true
}
]
}

3
db-datos/migrate-cli.js Normal file
View file

@ -0,0 +1,3 @@
import { getDb } from "./db.js";
getDb();

View file

@ -2,12 +2,17 @@
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import * as schema from "./schema.js";
import { sql } from "drizzle-orm";
import { existsSync } from "node:fs";
import { join } from "node:path";
/**
* @param {import("drizzle-orm/better-sqlite3").BetterSQLite3Database<schema>} db
*/
export function migrateDb(db) {
migrate(db, { migrationsFolder: "node_modules/db-datos/drizzle" });
let path = "drizzle/";
if (!existsSync(join(path, "meta/_journal.json")))
path = "node_modules/db-datos/drizzle";
migrate(db, { migrationsFolder: path });
db.run(sql`pragma journal_mode = WAL;`);
db.run(sql`PRAGMA synchronous = NORMAL;`);
}

View file

@ -6,7 +6,7 @@
"main": "index.js",
"scripts": {
"generate": "drizzle-kit generate:sqlite",
"migrate": "node db.js"
"migrate": "node migrate-cli.js"
},
"keywords": [],
"author": "",

View file

@ -18,8 +18,9 @@ export const precios = sqliteTable(
(precios) => {
return {
preciosEanIdx: index("precios_ean_idx").on(precios.ean),
preciosUrlIdx: index("precios_url_idx").on(precios.url),
};
},
}
);
/** @typedef {typeof precios.$inferSelect} Precio */
@ -32,3 +33,12 @@ export const productoUrls = sqliteTable("producto_urls", {
});
/** @typedef {typeof productoUrls.$inferSelect} ProductUrl */
export const bestSelling = sqliteTable("db_best_selling", {
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(),
category: text("category").notNull(),
eansJson: text("eans_json").notNull(),
});
/** @typedef {typeof bestSelling.$inferSelect} BestSelling */

View file

@ -16,6 +16,9 @@ export const hosts: { [host: string]: Supermercado } = {
"www.cotodigital3.com.ar": Supermercado.Coto,
"www.jumbo.com.ar": Supermercado.Jumbo,
};
export const hostBySupermercado = Object.fromEntries(
Object.entries(hosts).map(([a, b]) => [b, a])
) as Record<Supermercado, string>;
export const colorBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "#d52b1e",
[Supermercado.Carrefour]: "#19549d",

1
scraper-rs/.env Normal file
View file

@ -0,0 +1 @@
DATABASE_URL=sqlite:../sqlite.db

View file

@ -0,0 +1,12 @@
{
"db_name": "SQLite",
"query": "INSERT INTO producto_urls(url, first_seen, last_seen)\n VALUES (?1, ?2, ?2)\n ON CONFLICT(url) DO UPDATE SET last_seen=?2;",
"describe": {
"columns": [],
"parameters": {
"Right": 2
},
"nullable": []
},
"hash": "08d55fc80c8a6ad73d311e8b1cd535425e5c2f39cf98735b5f67cb91d01937ce"
}

View file

@ -0,0 +1,12 @@
{
"db_name": "SQLite",
"query": "INSERT INTO db_best_selling(fetched_at, category, eans_json)\n VALUES (?1, ?2, ?3);",
"describe": {
"columns": [],
"parameters": {
"Right": 3
},
"nullable": []
},
"hash": "144f4622ac9a937aa4885ceb5a67f6c0a78e7e025cf152a8c176b9fd1de241da"
}

View file

@ -0,0 +1,20 @@
{
"db_name": "SQLite",
"query": "SELECT url FROM producto_urls WHERE url LIKE ?1;",
"describe": {
"columns": [
{
"name": "url",
"ordinal": 0,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false
]
},
"hash": "aa5c2a04aec149d88f6e25a9bd7df4e257f3c9b0efa62c8342d077d69d826a69"
}

View file

@ -0,0 +1,12 @@
{
"db_name": "SQLite",
"query": "INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
"describe": {
"columns": [],
"parameters": {
"Right": 9
},
"nullable": []
},
"hash": "d0c3a557a81f6685b242ed0be8e8c67b47ec8a575d2a14a487b3294e0faec438"
}

View file

@ -0,0 +1,20 @@
{
"db_name": "SQLite",
"query": "SELECT ean FROM precios WHERE url = ?1;",
"describe": {
"columns": [
{
"name": "ean",
"ordinal": 0,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false
]
},
"hash": "f249765f2fb013a81a4157a6ce19744a8d5f70c83ed9ddddfd55009136088a52"
}

760
scraper-rs/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -8,11 +8,11 @@ edition = "2021"
[dependencies]
again = "0.1.2"
anyhow = "1.0.79"
base64 = "0.21.7"
chrono = "0.4.32"
clap = { version = "4.4.15", features = ["derive"] }
cron = "0.12.0"
deadpool = "0.10.0"
deadpool-sqlite = "0.7.0"
sqlx = { version = "0.7", features = [ "runtime-tokio", "sqlite" ] }
futures = "0.3.30"
html-escape = "0.2.13"
itertools = "0.12.0"

View file

@ -0,0 +1,131 @@
use std::collections::HashMap;
use crate::{build_client, db::Db, sites::vtex, supermercado::Supermercado};
use chrono::{DateTime, Utc};
use clap::ValueEnum;
use futures::{stream, FutureExt, StreamExt, TryStreamExt};
use itertools::Itertools;
use tracing::warn;
#[derive(ValueEnum, Clone, Debug)]
pub enum Category {
Almacen,
Bebidas,
FrutasYVerduras,
}
impl Category {
fn query(&self, supermercado: &Supermercado) -> Option<&'static str> {
match self {
Self::Almacen => match supermercado {
Supermercado::Jumbo | Supermercado::Dia => Some("almacen"),
_ => None,
},
Self::Bebidas => match supermercado {
Supermercado::Jumbo | Supermercado::Dia => Some("bebidas"),
_ => None,
},
Self::FrutasYVerduras => match supermercado {
Supermercado::Jumbo => Some("frutas-y-verduras"),
Supermercado::Dia => Some("frescos/frutas-y-verduras"),
_ => None,
},
}
}
pub fn id(&self) -> &'static str {
match self {
Self::Almacen => "almacen",
Self::Bebidas => "bebidas",
Self::FrutasYVerduras => "frutas-y-verduras",
}
}
}
#[derive(Debug)]
pub struct BestSellingRecord {
pub fetched_at: DateTime<Utc>,
pub category: Category,
pub eans: Vec<String>,
}
async fn get_best_selling_eans(db: &Db, urls: Vec<String>) -> anyhow::Result<Vec<String>> {
let mut eans: Vec<String> = Vec::new();
for url in urls {
let ean = db.get_ean_by_url(&url).await?;
match ean {
Some(e) => eans.push(e),
None => warn!("No encontré EAN para {}", url),
}
}
Ok(eans)
}
async fn try_get_best_selling_eans(
client: reqwest::Client,
db: Db,
supermercado: &Supermercado,
category: &Category,
) -> anyhow::Result<Option<Vec<String>>> {
if let Some(query) = category.query(supermercado) {
let urls = vtex::get_best_selling_by_category(&client, supermercado.host(), query).await?;
let eans = get_best_selling_eans(&db, urls).await?;
Ok(Some(eans))
} else {
Ok(None)
}
}
async fn noop<T>(t: T) -> anyhow::Result<T> {
Ok(t)
}
fn rank_eans(eans: Vec<Vec<String>>) -> Vec<String> {
let mut map: HashMap<String, usize> = HashMap::new();
for eans in eans {
for (i, ean) in eans.into_iter().enumerate() {
let base = map.get(&ean).unwrap_or(&0);
let score = base + 1000 / (i + 1);
map.insert(ean, score);
}
}
map.into_iter()
.sorted_by(|a, b| Ord::cmp(&b.1, &a.1))
.map(|t| t.0)
.collect_vec()
}
pub async fn get_all_best_selling(db: &Db) -> anyhow::Result<Vec<BestSellingRecord>> {
let client = &build_client();
stream::iter(Category::value_variants())
.map(|category| {
stream::iter(Supermercado::value_variants())
.map(|supermercado| {
let db = db.clone();
let client = client.clone();
tokio::spawn(try_get_best_selling_eans(
client,
db,
supermercado,
category,
))
})
.buffer_unordered(5)
.map(|f| f.unwrap())
.try_filter_map(noop)
.try_collect::<Vec<Vec<String>>>()
.map(|r| {
r.map(rank_eans).map(|eans| BestSellingRecord {
fetched_at: Utc::now(),
category: category.clone(),
eans,
})
})
})
.buffer_unordered(5)
.boxed()
.try_collect()
.await
}

111
scraper-rs/src/db.rs Normal file
View file

@ -0,0 +1,111 @@
use std::{
env,
str::FromStr,
time::{SystemTime, UNIX_EPOCH},
};
use sqlx::{sqlite::SqliteConnectOptions, SqlitePool};
use tracing::info;
use crate::{best_selling::BestSellingRecord, PrecioPoint};
#[derive(Clone)]
pub struct Db {
pool: SqlitePool,
}
impl Db {
pub async fn connect() -> anyhow::Result<Self> {
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
info!("Opening DB at {}", db_path);
let pool = sqlx::pool::PoolOptions::new()
.max_connections(1)
.connect_with(
SqliteConnectOptions::from_str(&format!("sqlite://{}", db_path))?
.journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)
.synchronous(sqlx::sqlite::SqliteSynchronous::Normal)
.optimize_on_close(true, None),
)
.await?;
Ok(Self { pool })
}
pub async fn insert_precio(&self, point: PrecioPoint) -> anyhow::Result<()> {
sqlx::query!("INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
point.ean,
point.fetched_at,
point.precio_centavos,
point.in_stock,
point.url,
None::<String>,
point.parser_version,
point.name,
point.image_url,
).execute(&self.pool).await?;
Ok(())
}
pub async fn get_ean_by_url(&self, url: &str) -> anyhow::Result<Option<String>> {
Ok(sqlx::query!("SELECT ean FROM precios WHERE url = ?1;", url)
.fetch_optional(&self.pool)
.await?
.map(|r| r.ean))
}
pub async fn get_urls_by_domain(&self, domain: &str) -> anyhow::Result<Vec<String>> {
let query = format!("%{}%", domain);
Ok(
sqlx::query!("SELECT url FROM producto_urls WHERE url LIKE ?1;", query)
.fetch_all(&self.pool)
.await?
.into_iter()
.map(|r| r.url)
.collect(),
)
}
pub async fn save_producto_urls(&self, urls: Vec<String>) -> anyhow::Result<()> {
let now: i64 = now_ms().try_into()?;
let mut tx = self.pool.begin().await?;
for url in urls {
sqlx::query!(
r#"INSERT INTO producto_urls(url, first_seen, last_seen)
VALUES (?1, ?2, ?2)
ON CONFLICT(url) DO UPDATE SET last_seen=?2;"#,
url,
now
)
.execute(&mut *tx)
.await?;
}
tx.commit().await?;
Ok(())
}
pub async fn save_best_selling(&self, records: Vec<BestSellingRecord>) -> anyhow::Result<()> {
let mut tx = self.pool.begin().await?;
for record in records {
let fetched_at = record.fetched_at.timestamp_millis();
let category = record.category.id();
let eans_json = serde_json::Value::from(record.eans).to_string();
sqlx::query!(
r#"INSERT INTO db_best_selling(fetched_at, category, eans_json)
VALUES (?1, ?2, ?3);"#,
fetched_at,
category,
eans_json
)
.execute(&mut *tx)
.await?;
}
tx.commit().await?;
Ok(())
}
}
fn now_ms() -> u128 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Time went backwards")
.as_millis()
}

View file

@ -1,10 +1,11 @@
use again::RetryPolicy;
use chrono::{DateTime, Utc};
use clap::{Parser, ValueEnum};
use cron::Schedule;
use deadpool_sqlite::Pool;
use futures::{future, stream, StreamExt};
use db::Db;
use futures::{future, stream, Future, StreamExt};
use nanoid::nanoid;
use reqwest::{StatusCode, Url};
use reqwest::{header::HeaderMap, StatusCode, Url};
use simple_error::{bail, SimpleError};
use std::{
env::{self},
@ -15,23 +16,8 @@ use std::{
};
use thiserror::Error;
#[derive(ValueEnum, Clone, Debug)]
enum Supermercado {
Dia,
Jumbo,
Carrefour,
Coto,
}
impl Supermercado {
fn host(&self) -> &'static str {
match self {
Self::Dia => "diaonline.supermercadosdia.com.ar",
Self::Carrefour => "www.carrefour.com.ar",
Self::Coto => "www.cotodigital3.com.ar",
Self::Jumbo => "www.jumbo.com.ar",
}
}
}
mod supermercado;
use supermercado::Supermercado;
#[derive(Parser)] // requires `derive` feature
enum Args {
@ -39,6 +25,7 @@ enum Args {
ParseFile(ParseFileArgs),
GetUrlList(GetUrlListArgs),
ScrapUrl(ScrapUrlArgs),
ScrapBestSelling,
Auto(AutoArgs),
Cron(AutoArgs),
}
@ -71,6 +58,7 @@ async fn main() -> anyhow::Result<()> {
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
Args::ScrapBestSelling => scrap_best_selling_cli().await,
Args::Auto(_) => auto_cli().await,
Args::Cron(_) => cron_cli().await,
}
@ -83,6 +71,14 @@ async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
println!("Result: {:#?}", res);
res.map(|_| ())
}
mod best_selling;
async fn scrap_best_selling_cli() -> anyhow::Result<()> {
let db = Db::connect().await?;
let res = best_selling::get_all_best_selling(&db).await;
println!("Result: {:#?}", res);
res.map(|_| ())
}
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
let links_str = fs::read_to_string(links_list_path).unwrap();
@ -93,14 +89,14 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
.map(|s| s.to_owned())
.collect::<Vec<_>>();
let pool = connect_db();
let counters = fetch_list(&pool, links).await;
let db = Db::connect().await?;
let counters = fetch_list(&db, links).await;
println!("Finished: {:?}", counters);
Ok(())
}
async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
async fn fetch_list(db: &Db, links: Vec<String>) -> Counters {
let n_coroutines = env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número");
@ -109,9 +105,9 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
stream::iter(links)
.map(|url| {
let pool = pool.clone();
let db = db.clone();
let client = client.clone();
tokio::spawn(fetch_and_save(client, url, pool))
tokio::spawn(fetch_and_save(client, url, db))
})
.buffer_unordered(n_coroutines)
.fold(Counters::default(), move |x, y| {
@ -125,19 +121,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
.await
}
fn connect_db() -> Pool {
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
let cfg = deadpool_sqlite::Config::new(db_path);
cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
}
fn build_client() -> reqwest::Client {
reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60))
.build()
.unwrap()
}
mod db;
#[derive(Default, Debug)]
struct Counters {
@ -146,26 +130,13 @@ struct Counters {
skipped: u64,
}
async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Counters {
async fn fetch_and_save(client: reqwest::Client, url: String, db: Db) -> Counters {
let res = fetch_and_parse(&client, url.clone()).await;
let mut counters = Counters::default();
match res {
Ok(res) => {
counters.success += 1;
pool.get().await.unwrap().interact(move |conn| conn.execute(
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
rusqlite::params![
res.ean,
res.fetched_at,
res.precio_centavos,
res.in_stock,
res.url,
None::<String>,
res.parser_version,
res.name,
res.image_url,
]
)).await.unwrap().unwrap();
db.insert_precio(res).await.unwrap();
}
Err(err) => {
match err.downcast_ref::<reqwest::Error>() {
@ -190,6 +161,16 @@ enum FetchError {
Tl(#[from] tl::ParseError),
}
fn build_client() -> reqwest::Client {
let mut headers = HeaderMap::new();
headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap());
reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60))
.default_headers(headers)
.build()
.unwrap()
}
pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
let request = client.get(url).build()?;
let response = client.execute(request).await?.error_for_status()?;
@ -222,7 +203,8 @@ async fn fetch_and_parse(
let point = match maybe_point {
Ok(p) => Ok(p),
Err(err) => {
let debug_path = PathBuf::from("debug/");
let now: DateTime<Utc> = Utc::now();
let debug_path = PathBuf::from(format!("debug-{}/", now.format("%Y-%m-%d")));
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
@ -303,7 +285,7 @@ struct AutoTelegram {
#[derive(Clone)]
struct Auto {
pool: Pool,
db: Db,
telegram: Option<AutoTelegram>,
}
impl Auto {
@ -318,24 +300,7 @@ impl Auto {
))
.await;
}
let links: Vec<String> = {
let search = format!("%{}%", supermercado.host());
self.pool
.get()
.await?
.interact(move |conn| -> anyhow::Result<Vec<String>> {
Ok(conn
.prepare(
r#"SELECT url FROM producto_urls
WHERE url LIKE ?1;"#,
)?
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
.map(|r| r.unwrap())
.collect())
})
.await
.unwrap()?
};
let links: Vec<String> = self.db.get_urls_by_domain(supermercado.host()).await?;
// {
// let debug_path = PathBuf::from("debug/");
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
@ -347,7 +312,7 @@ impl Auto {
// }
{
let t0 = now_sec();
let counters = fetch_list(&self.pool, links).await;
let counters = fetch_list(&self.db, links).await;
self.inform(&format!(
"Downloaded {:?}: {:?} (took {})",
&supermercado,
@ -356,32 +321,21 @@ impl Auto {
))
.await;
}
Ok(())
}
async fn inform_time<T: Future<Output = R>, R>(&self, msg: &str, action: T) -> R {
let t0 = now_sec();
let res = action.await;
self.inform(&format!("{} (took {})", msg, now_sec() - t0))
.await;
res
}
async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
let urls = get_urls(supermercado).await?;
self.pool
.get()
.await?
.interact(|conn| -> Result<(), anyhow::Error> {
let tx = conn.transaction()?;
{
let mut stmt = tx.prepare(
r#"INSERT INTO producto_urls(url, first_seen, last_seen)
VALUES (?1, ?2, ?2)
ON CONFLICT(url) DO UPDATE SET last_seen=?2;"#,
)?;
let now: u64 = now_ms().try_into()?;
for url in urls {
stmt.execute(rusqlite::params![url, now])?;
}
}
tx.commit()?;
Ok(())
})
.await
.unwrap()?;
self.db.save_producto_urls(urls).await?;
Ok(())
}
@ -402,26 +356,37 @@ impl Auto {
}
async fn auto_cli() -> anyhow::Result<()> {
let db = connect_db();
let telegram = {
match (
env::var("TELEGRAM_BOT_TOKEN"),
env::var("TELEGRAM_BOT_CHAT_ID"),
) {
(Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
_ => {
tracing::warn!("No token or chat_id for telegram");
None
let auto = {
let db = Db::connect().await?;
let telegram = {
match (
env::var("TELEGRAM_BOT_TOKEN"),
env::var("TELEGRAM_BOT_CHAT_ID"),
) {
(Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
_ => {
tracing::warn!("No token or chat_id for telegram");
None
}
}
}
};
Auto { db, telegram }
};
let auto = Auto { pool: db, telegram };
auto.inform("[auto] Empezando scrap").await;
let handles: Vec<_> = Supermercado::value_variants()
.iter()
.map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned())))
.collect();
future::try_join_all(handles).await?;
let best_selling = auto
.inform_time(
"Downloaded best selling",
best_selling::get_all_best_selling(&auto.db),
)
.await?;
auto.db.save_best_selling(best_selling).await?;
Ok(())
}
async fn cron_cli() -> anyhow::Result<()> {
@ -449,8 +414,8 @@ mod sites;
struct PrecioPoint {
ean: String,
// unix
fetched_at: u64,
precio_centavos: Option<u64>,
fetched_at: i64,
precio_centavos: Option<i64>,
in_stock: Option<bool>,
url: String,
parser_version: u16,
@ -458,13 +423,9 @@ struct PrecioPoint {
image_url: Option<String>,
}
fn now_sec() -> u64 {
since_the_epoch().as_secs()
fn now_sec() -> i64 {
since_the_epoch().as_secs().try_into().unwrap()
}
fn now_ms() -> u128 {
since_the_epoch().as_millis()
}
fn since_the_epoch() -> Duration {
SystemTime::now()
.duration_since(UNIX_EPOCH)

View file

@ -11,9 +11,9 @@ pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str
.map(|s| s.as_utf8_str())
}
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<i64>, anyhow::Error> {
let precio_centavos = get_meta_content(dom, "product:price:amount")
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as i64))
.transpose()?;
Ok(precio_centavos)
}

View file

@ -37,7 +37,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
})
.transpose()
.context("Parseando precio")?
.map(|f| (f * 100.0) as u64);
.map(|f| (f * 100.0) as i64);
let in_stock = Some(
dom.query_selector(".product_not_available")

View file

@ -3,4 +3,4 @@ mod common;
pub mod coto;
pub mod dia;
pub mod jumbo;
mod vtex;
pub mod vtex;

View file

@ -1,7 +1,12 @@
use std::str::FromStr;
use anyhow::{bail, Context};
use base64::Engine;
use futures::{stream, StreamExt, TryStreamExt};
use itertools::Itertools;
use reqwest::Url;
use serde::Deserialize;
use serde_json::json;
use simple_error::SimpleError;
use tl::VDom;
@ -100,7 +105,7 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
"instock" => true,
_ => bail!("Not a valid product:availability"),
},
None => bail!("No product:availability in carrefour"),
None => bail!("No product:availability in vtex"),
},
)
}
@ -146,6 +151,101 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
Ok(total.into_iter().unique().collect())
}
async fn fetch_body<'a>(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
let body = get_retry_policy()
.retry_if(|| do_request(client, url), retry_if_wasnt_not_found)
.await?
.text()
.await?;
Ok(body)
}
async fn get_binding_id(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
let body = fetch_body(client, url).await?;
let dom = tl::parse(&body, tl::ParserOptions::default())?;
let json = parse_script_json(&dom, "__RUNTIME__")?;
let id = json
.as_object()
.ok_or(SimpleError::new("RUNTIME not an object"))?
.get("binding")
.and_then(|v| v.as_object())
.and_then(|o| o.get("id"))
.and_then(|v| v.as_str())
.ok_or(SimpleError::new("binding.id does not exist"))?
.to_string();
Ok(id)
}
/// Returns a vec of product URLs
///
/// Estos parametros se consiguen yendo a una página como `https://www.jumbo.com.ar/almacen` y extrayendo:
/// * `domain` - www.jumbo.com.ar
/// * `query` - almacen
///
/// También `https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras`:
/// * `domain` - diaonline.supermercadosdia.com.ar
/// * `query` - frescos/frutas-y-verduras
pub async fn get_best_selling_by_category(
client: &reqwest::Client,
domain: &str,
query: &str,
) -> anyhow::Result<Vec<String>> {
let base_url = { Url::from_str(&format!("https://{}/{}", domain, query)).unwrap() };
let binding_id = get_binding_id(client, base_url.as_str()).await?;
let url = {
let mut url = base_url.clone();
url.set_path("/_v/segment/graphql/v1");
url.query_pairs_mut().append_pair("workspace", "master")
.append_pair("maxAge", "short")
.append_pair("appsEtag", "remove")
.append_pair("domain", "store")
.append_pair("locale", "es-AR")
.append_pair("__bindingId", &binding_id)
.append_pair("operationName", "productSearchV3")
.append_pair("variables", "%7B%7D")
.append_pair("extensions", &{
let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets":
query.split('/').map(|f| json!({"key":"c","value":f})).collect::<Vec<_>>()
,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false});
let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string());
format!(
r#"{{
"persistedQuery": {{
"version": 1,
"sha256Hash": "40b843ca1f7934d20d05d334916220a0c2cae3833d9f17bcb79cdd2185adceac",
"sender": "vtex.store-resources@0.x",
"provider": "vtex.search-graphql@0.x"
}},
"variables": "{}"
}}"#, b64
)
});
url
};
let body = fetch_body(client, url.as_str()).await?;
let urls: Vec<String> = serde_json::from_str::<serde_json::Value>(&body)?
.pointer("/data/productSearch/products")
.and_then(|v| v.as_array())
.map(|a| {
a.iter()
.filter_map(|p| {
p.get("link")
.and_then(|v| v.as_str())
.map(|s| format!("https://{}{}", domain, s))
})
.collect()
})
.ok_or(SimpleError::new("failed to get best selling product urls"))?;
if urls.len() < 2 {
bail!("Too few best selling");
}
Ok(urls)
}
#[cfg(test)]
mod tests {
use super::*;
@ -163,4 +263,11 @@ mod tests {
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
Ok(())
}
#[tokio::test]
async fn test_jumbo_best_selling() -> anyhow::Result<()> {
get_best_selling_by_category(&build_client(), "www.jumbo.com.ar", "almacen").await?;
// assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
Ok(())
}
}

View file

@ -0,0 +1,19 @@
use clap::ValueEnum;
#[derive(ValueEnum, Clone, Debug)]
pub enum Supermercado {
Dia,
Jumbo,
Carrefour,
Coto,
}
impl Supermercado {
pub fn host(&self) -> &'static str {
match self {
Self::Dia => "diaonline.supermercadosdia.com.ar",
Self::Carrefour => "www.carrefour.com.ar",
Self::Coto => "www.cotodigital3.com.ar",
Self::Jumbo => "www.jumbo.com.ar",
}
}
}

View file

@ -1,25 +1,63 @@
import type { PageData, PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios } = schema;
import { sql } from "drizzle-orm";
import { desc, sql } from "drizzle-orm";
import {
Supermercado,
hostBySupermercado,
supermercados,
} from "db-datos/supermercado";
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery();
async function doQuery() {
const db = await getDb();
const q = db
console.time("ean");
const eans = await db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios)
.groupBy(precios.ean)
.having(sql`max(length(name)) and max(parser_version) and in_stock`)
.orderBy(sql`random()`)
.limit(150);
const res = await q;
const data = { precios: res };
.limit(50);
console.timeEnd("ean");
return;
const precioss = await Promise.all(
supermercados.map(
async (
supermercado,
): Promise<
[
Supermercado,
{ ean: string; name: string | null; imageUrl: string | null }[],
]
> => {
const host = hostBySupermercado[supermercado];
console.time(supermercado);
const q = db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios)
.groupBy(precios.ean)
.having(sql`max(fetched_at)`)
.where(
sql`ean in ${eans.map((x) => x.ean)} and in_stock and url like ${`%${host}%`}`,
);
// console.debug(q.toSQL());
const res = await q;
console.timeEnd(supermercado);
return [supermercado, res];
},
),
);
const data = { precios: precioss.flatMap(([_, r]) => r) };
return { key: new Date(), data };
}

View file

@ -7,6 +7,13 @@
(d): d is { ean: string; name: string; imageUrl: string | null } =>
!!d.name,
);
$: productos = precios.reduce(
(prev, curr) => [
...prev,
...(prev.find((p) => p.ean === curr.ean) ? [] : [curr]),
],
[] as { ean: string; name: string; imageUrl: string | null }[],
);
</script>
<h1 class="text-xl">WIP</h1>
@ -39,7 +46,7 @@
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each precios as product}
{#each productos as product}
<li>
<ProductPreview {product} />
</li>