From 258346e3d6ce03407c5a4b810ca5a3f669979122 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Wed, 13 Nov 2024 09:48:23 -0300
Subject: [PATCH] WIP: metadata scrapped

---
Review notes (free-form text between `---` and the first `diff --git` line;
not part of the commit):

- rust/src/api/main.rs adds GET /api/0/internal/latest-metadata, served by
  dump_latest_metadata. For every ean it selects the `precios` row(s) whose
  fetched_at equals that ean's MAX(fetched_at), keeps only rows with a
  non-NULL name, and returns them as a JSON array of Metadata.
  - rows tied on the maximum fetched_at for one ean are all returned.
  - p.id is selected but never copied into Metadata.
  - in_stock is read as an integer and mapped with `x == 1`.
  - both the query result and DateTime::from_timestamp are unwrap()ed, so
    a query error or an unrepresentable timestamp panics the handler.
- sepa/db/schema.ts declares the productos_metadata_scrapped table with a
  non-unique index on ean. ean is a nullable bigint there, whereas the Rust
  Metadata struct carries it as a String.
- sepa/scripts/refresh-scrapped-metadata.ts opens the scraper's SQLite
  database read-only, runs the same "latest row per ean" query and, at this
  WIP stage, only console.logs each row; the drizzle `db` handle is created
  but not used yet.
- NOTE(review): in this copy of the patch generic arguments look stripped
  (`State`, `Path`, `Option`, `chrono::DateTime` appear bare, and the
  `From:` header has no address) - presumably an extraction artifact;
  confirm against the original commit before applying.

 rust/src/api/main.rs                      | 39 ++++++++++++++++++++
 sepa/db/schema.ts                         | 21 +++++++++++
 sepa/scripts/refresh-scrapped-metadata.ts | 43 +++++++++++++++++++++++
 3 files changed, 103 insertions(+)
 create mode 100644 sepa/scripts/refresh-scrapped-metadata.ts

diff --git a/rust/src/api/main.rs b/rust/src/api/main.rs
index 90816f8..902fbe4 100644
--- a/rust/src/api/main.rs
+++ b/rust/src/api/main.rs
@@ -266,6 +266,44 @@ async fn search(State(pool): State, Path(query): Path) -> im
     Json(results)
 }
 
+#[derive(sqlx::FromRow, Debug, Serialize)]
+struct Metadata {
+    ean: String,
+    fetched_at: chrono::DateTime,
+    precio_centavos: Option,
+    in_stock: Option,
+    url: String,
+    name: Option,
+    image_url: Option,
+}
+
+async fn dump_latest_metadata(State(pool): State) -> impl IntoResponse {
+    let precios = sqlx::query!("
+        SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
+        FROM precios p
+        INNER JOIN (
+            SELECT ean, MAX(fetched_at) as max_fetched_at
+            FROM precios
+            GROUP BY ean
+        ) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
+        WHERE p.name IS NOT NULL")
+    .fetch_all(&pool)
+    .await
+    .unwrap()
+    .into_iter()
+    .map(|r| Metadata {
+        ean: r.ean,
+        fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
+        image_url: r.image_url,
+        name: r.name,
+        in_stock: r.in_stock.map(|x| x == 1),
+        precio_centavos: r.precio_centavos,
+        url: r.url,
+    })
+    .collect_vec();
+    Json(precios)
+}
+
 async fn get_info(State(pool): State) -> impl IntoResponse {
     #[derive(Serialize)]
     struct Info {
@@ -321,6 +359,7 @@ async fn main() {
         .route("/api/0/ean/:ean/history", get(get_product_history))
         .route("/api/0/info", get(get_info))
         .route("/api/0/search/:query", get(search))
+        .route("/api/0/internal/latest-metadata", get(dump_latest_metadata))
         .with_state(pool);
 
     let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
diff --git a/sepa/db/schema.ts b/sepa/db/schema.ts
index efd4aba..192375e 100644
--- a/sepa/db/schema.ts
+++ b/sepa/db/schema.ts
@@ -11,6 +11,8 @@ import {
   index,
   pgMaterializedView,
   pgView,
+  timestamp,
+  boolean,
 } from "drizzle-orm/pg-core";
 
 export const datasets = pgTable(
@@ -211,3 +213,22 @@ export const productos_descripcion_index = pgTable(
     ).using("gin", sql`to_tsvector('spanish', ${table.productos_descripcion})`),
   })
 );
+
+// see scripts/refresh-scrapped-metadata.ts
+export const productos_metadata_scrapped = pgTable(
+  "productos_metadata_scrapped",
+  {
+    ean: bigint("ean", { mode: "bigint" }),
+    fetchedAt: timestamp("fetched_at").notNull(),
+    precioCentavos: integer("precio_centavos"),
+    inStock: boolean("in_stock"),
+    url: text("url").notNull(),
+    name: text("name"),
+    imageUrl: text("image_url"),
+  },
+  (table) => ({
+    productos_metadata_scrapped_ean_idx: index(
+      "productos_metadata_scrapped_ean_idx"
+    ).on(table.ean),
+  })
+);
diff --git a/sepa/scripts/refresh-scrapped-metadata.ts b/sepa/scripts/refresh-scrapped-metadata.ts
new file mode 100644
index 0000000..4f529dd
--- /dev/null
+++ b/sepa/scripts/refresh-scrapped-metadata.ts
@@ -0,0 +1,43 @@
+/**
+ * This script updates the "new" database from a database generated by
+ * preciazo's "old" scraper, which scrapes the supermarkets' websites.
+ *
+ * It only stores the latest metadata for each product.
+ *
+ * The scraper's SQLite database is passed as a parameter.
+ */
+
+import { drizzle } from "drizzle-orm/postgres-js";
+import postgres from "postgres";
+import * as schema from "../db/schema";
+import { Database } from "bun:sqlite";
+
+if (!process.argv[2]) {
+  console.error("falta pasar la base de datos del scraper como parametro");
+  process.exit(1);
+}
+
+const db = drizzle(postgres(), {
+  schema,
+  logger: true,
+});
+using scraperDb = new Database(process.argv[2], {
+  strict: true,
+  readonly: true,
+});
+
+const precios = scraperDb.query(`
+  SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
+  FROM precios p
+  INNER JOIN (
+    SELECT ean, MAX(fetched_at) as max_fetched_at
+    FROM precios
+    GROUP BY ean
+  ) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
+  WHERE p.name IS NOT NULL
+`);
+
+// @ts-expect-error bun 1.1.30 has outdated types, it's fixed in main branch
+for (const row of precios.iterate()) {
+  console.log(row);
+}