WIP: metadata scrapped

This commit is contained in:
Cat /dev/Nulo 2024-11-13 09:48:23 -03:00
parent 544b0471b9
commit 258346e3d6
3 changed files with 103 additions and 0 deletions

View file

@ -266,6 +266,44 @@ async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> im
Json(results)
}
/// One scraped metadata record for a product, as serialized to JSON by
/// `dump_latest_metadata`.
///
/// Field names match the columns selected from the `precios` table.
#[derive(sqlx::FromRow, Debug, Serialize)]
struct Metadata {
/// Product barcode used as the grouping key in the query.
ean: String,
/// When the row was fetched (UTC).
fetched_at: chrono::DateTime<Utc>,
/// Price; presumably expressed in cents, going by the column name — TODO confirm.
precio_centavos: Option<i64>,
/// Stock flag; `None` when the column is NULL.
in_stock: Option<bool>,
/// URL of the scraped product page.
url: String,
/// Product name, if one was scraped.
name: Option<String>,
/// Product image URL, if one was scraped.
image_url: Option<String>,
}
async fn dump_latest_metadata(State(pool): State<SqlitePool>) -> impl IntoResponse {
let precios = sqlx::query!("
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
FROM precios p
INNER JOIN (
SELECT ean, MAX(fetched_at) as max_fetched_at
FROM precios
GROUP BY ean
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
WHERE p.name IS NOT NULL")
.fetch_all(&pool)
.await
.unwrap()
.into_iter()
.map(|r| Metadata {
ean: r.ean,
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
image_url: r.image_url,
name: r.name,
in_stock: r.in_stock.map(|x| x == 1),
precio_centavos: r.precio_centavos,
url: r.url,
})
.collect_vec();
Json(precios)
}
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
#[derive(Serialize)]
struct Info {
@ -321,6 +359,7 @@ async fn main() {
.route("/api/0/ean/:ean/history", get(get_product_history))
.route("/api/0/info", get(get_info))
.route("/api/0/search/:query", get(search))
.route("/api/0/internal/latest-metadata", get(dump_latest_metadata))
.with_state(pool);
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();

View file

@ -11,6 +11,8 @@ import {
index,
pgMaterializedView,
pgView,
timestamp,
boolean,
} from "drizzle-orm/pg-core";
export const datasets = pgTable(
@ -211,3 +213,22 @@ export const productos_descripcion_index = pgTable(
).using("gin", sql`to_tsvector('spanish', ${table.productos_descripcion})`),
})
);
// see scripts/refresh-scrapped-metadata.ts
// Table holding product metadata obtained by scraping.
// NOTE(review): `ean` has no `.notNull()`, so it is nullable even though it is
// the only indexed column — confirm whether that is intended.
export const productos_metadata_scrapped = pgTable(
"productos_metadata_scrapped",
{
ean: bigint("ean", { mode: "bigint" }),
fetchedAt: timestamp("fetched_at").notNull(),
precioCentavos: integer("precio_centavos"),
inStock: boolean("in_stock"),
url: text("url").notNull(),
name: text("name"),
imageUrl: text("image_url"),
},
(table) => ({
// Index on `ean`.
productos_metadata_scrapped_ean_idx: index(
"productos_metadata_scrapped_ean_idx"
).on(table.ean),
})
);

View file

@ -0,0 +1,43 @@
/**
* This script updates the "new" database from a database generated by
* preciazo's "old" scraper, which scrapes the supermarkets' websites.
*
* It only stores the latest metadata of each product.
*
* The scraper's SQLite database is passed as a parameter.
*/
import { drizzle } from "drizzle-orm/postgres-js";
import postgres from "postgres";
import * as schema from "../db/schema";
import { Database } from "bun:sqlite";
// Path of the scraper's SQLite database: the first CLI argument.
const scraperDbPath = process.argv[2];
if (!scraperDbPath) {
  console.error("falta pasar la base de datos del scraper como parametro");
  process.exit(1);
}

// Postgres handle built from the project schema, with query logging enabled.
// It is not used by any statement in this part of the script.
const db = drizzle(postgres(), { schema, logger: true });

// Open the scraper database read-only; `using` disposes it when the scope ends.
using scraperDb = new Database(scraperDbPath, { strict: true, readonly: true });

// For every EAN, select the row(s) with the greatest `fetched_at`, skipping
// rows whose `name` is NULL.
const latestPreciosQuery = scraperDb.query(`
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
FROM precios p
INNER JOIN (
SELECT ean, MAX(fetched_at) as max_fetched_at
FROM precios
GROUP BY ean
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
WHERE p.name IS NOT NULL
`);

// Print each selected row.
// @ts-expect-error bun 1.1.30 has outdated types, it's fixed in main branch
for (const row of latestPreciosQuery.iterate()) {
  console.log(row);
}