mirror of
https://github.com/catdevnull/preciazo.git
synced 2025-02-21 10:56:25 +00:00
WIP: metadata scrapped
This commit is contained in:
parent
544b0471b9
commit
258346e3d6
3 changed files with 103 additions and 0 deletions
|
@ -266,6 +266,44 @@ async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> im
|
|||
Json(results)
|
||||
}
|
||||
|
||||
/// One product's most recently scraped metadata, serialized to JSON by
/// `dump_latest_metadata`. Fields mirror the columns selected from the
/// scraper's `precios` table.
#[derive(sqlx::FromRow, Debug, Serialize)]
struct Metadata {
    // EAN barcode identifying the product.
    ean: String,
    // When the scraper last fetched this product. Persisted as unix
    // seconds in SQLite and converted via `DateTime::from_timestamp`.
    fetched_at: chrono::DateTime<Utc>,
    // Price in centavos; None when the scraper could not parse a price.
    precio_centavos: Option<i64>,
    // None when stock information was unavailable for this fetch.
    in_stock: Option<bool>,
    // Product page URL on the supermarket site.
    url: String,
    // Display name. The dump query filters out NULL names, but the
    // column itself is nullable so the field stays Option.
    name: Option<String>,
    image_url: Option<String>,
}
|
||||
|
||||
async fn dump_latest_metadata(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||
let precios = sqlx::query!("
|
||||
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
|
||||
FROM precios p
|
||||
INNER JOIN (
|
||||
SELECT ean, MAX(fetched_at) as max_fetched_at
|
||||
FROM precios
|
||||
GROUP BY ean
|
||||
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
|
||||
WHERE p.name IS NOT NULL")
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|r| Metadata {
|
||||
ean: r.ean,
|
||||
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
|
||||
image_url: r.image_url,
|
||||
name: r.name,
|
||||
in_stock: r.in_stock.map(|x| x == 1),
|
||||
precio_centavos: r.precio_centavos,
|
||||
url: r.url,
|
||||
})
|
||||
.collect_vec();
|
||||
Json(precios)
|
||||
}
|
||||
|
||||
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||
#[derive(Serialize)]
|
||||
struct Info {
|
||||
|
@ -321,6 +359,7 @@ async fn main() {
|
|||
.route("/api/0/ean/:ean/history", get(get_product_history))
|
||||
.route("/api/0/info", get(get_info))
|
||||
.route("/api/0/search/:query", get(search))
|
||||
.route("/api/0/internal/latest-metadata", get(dump_latest_metadata))
|
||||
.with_state(pool);
|
||||
|
||||
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
|
||||
|
|
|
@ -11,6 +11,8 @@ import {
|
|||
index,
|
||||
pgMaterializedView,
|
||||
pgView,
|
||||
timestamp,
|
||||
boolean,
|
||||
} from "drizzle-orm/pg-core";
|
||||
|
||||
export const datasets = pgTable(
|
||||
|
@ -211,3 +213,22 @@ export const productos_descripcion_index = pgTable(
|
|||
).using("gin", sql`to_tsvector('spanish', ${table.productos_descripcion})`),
|
||||
})
|
||||
);
|
||||
|
||||
// Latest scraped metadata per product, copied over from the old scraper's
// SQLite database — see scripts/refresh-scrapped-metadata.ts for the job
// that refreshes it.
export const productos_metadata_scrapped = pgTable(
  "productos_metadata_scrapped",
  {
    // EAN barcode. NOTE(review): left nullable although the refresh query
    // joins and groups on it — confirm whether .notNull() is intended.
    ean: bigint("ean", { mode: "bigint" }),
    // Timestamp of the scrape this row was taken from.
    fetchedAt: timestamp("fetched_at").notNull(),
    // Price in centavos; null when the scraper could not parse one.
    precioCentavos: integer("precio_centavos"),
    // Null when stock information was unavailable.
    inStock: boolean("in_stock"),
    // Product page URL on the supermarket site.
    url: text("url").notNull(),
    name: text("name"),
    imageUrl: text("image_url"),
  },
  (table) => ({
    // Index for cheap per-product lookups by EAN.
    productos_metadata_scrapped_ean_idx: index(
      "productos_metadata_scrapped_ean_idx"
    ).on(table.ean),
  })
);
|
||||
|
|
43
sepa/scripts/refresh-scrapped-metadata.ts
Normal file
43
sepa/scripts/refresh-scrapped-metadata.ts
Normal file
|
@ -0,0 +1,43 @@
|
|||
/**
 * This script updates the "new" database from a database generated by the
 * old preciazo scraper, which scrapes the supermarkets' websites.
 *
 * It only keeps the latest metadata for each product.
 *
 * The scraper's SQLite database is passed as a command-line argument.
 */

import { drizzle } from "drizzle-orm/postgres-js";
import postgres from "postgres";
import * as schema from "../db/schema";
import { Database } from "bun:sqlite";

// Fail fast when the SQLite database path argument is missing.
if (!process.argv[2]) {
  console.error("falta pasar la base de datos del scraper como parametro");
  process.exit(1);
}

// postgres() with no arguments reads connection settings from the
// environment (PGHOST/PGUSER/etc.).
const db = drizzle(postgres(), {
  schema,
  logger: true,
});
// `using` declaration: the SQLite handle is disposed when it leaves scope.
using scraperDb = new Database(process.argv[2], {
  strict: true,
  readonly: true,
});

// For each EAN, select only the row from its most recent fetch; rows
// without a name carry no useful metadata and are skipped. Same query the
// Rust API server uses for its latest-metadata dump.
const precios = scraperDb.query(`
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
FROM precios p
INNER JOIN (
	SELECT ean, MAX(fetched_at) as max_fetched_at
	FROM precios
	GROUP BY ean
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
WHERE p.name IS NOT NULL
`);

// WIP: for now just print each row; upserting into `db`
// (schema.productos_metadata_scrapped) is still TODO.
// @ts-expect-error bun 1.1.30 has outdated types, it's fixed in main branch
for (const row of precios.iterate()) {
  console.log(row);
}
|
Loading…
Reference in a new issue