mirror of
https://github.com/catdevnull/preciazo.git
synced 2025-02-22 13:56:23 +00:00
WIP: metadata scrapped
This commit is contained in:
parent
544b0471b9
commit
258346e3d6
3 changed files with 103 additions and 0 deletions
|
@ -266,6 +266,44 @@ async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> im
|
||||||
Json(results)
|
Json(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Latest known scraped metadata for a single product, serialized to JSON
/// by the `/api/0/internal/latest-metadata` endpoint.
#[derive(sqlx::FromRow, Debug, Serialize)]
struct Metadata {
    // EAN barcode identifying the product.
    ean: String,
    // When this row was scraped; stored as a unix timestamp in SQLite and
    // converted via `DateTime::from_timestamp` when loading.
    fetched_at: chrono::DateTime<Utc>,
    // Price in centavos; None when the scraper could not read a price.
    precio_centavos: Option<i64>,
    // None when stock status is unknown (SQLite stores this as a 0/1 integer).
    in_stock: Option<bool>,
    // Product page URL on the supermarket site.
    url: String,
    // Product display name; rows with NULL names are filtered out by the query.
    name: Option<String>,
    image_url: Option<String>,
}
|
||||||
|
|
||||||
|
async fn dump_latest_metadata(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||||
|
let precios = sqlx::query!("
|
||||||
|
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
|
||||||
|
FROM precios p
|
||||||
|
INNER JOIN (
|
||||||
|
SELECT ean, MAX(fetched_at) as max_fetched_at
|
||||||
|
FROM precios
|
||||||
|
GROUP BY ean
|
||||||
|
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
|
||||||
|
WHERE p.name IS NOT NULL")
|
||||||
|
.fetch_all(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.into_iter()
|
||||||
|
.map(|r| Metadata {
|
||||||
|
ean: r.ean,
|
||||||
|
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
|
||||||
|
image_url: r.image_url,
|
||||||
|
name: r.name,
|
||||||
|
in_stock: r.in_stock.map(|x| x == 1),
|
||||||
|
precio_centavos: r.precio_centavos,
|
||||||
|
url: r.url,
|
||||||
|
})
|
||||||
|
.collect_vec();
|
||||||
|
Json(precios)
|
||||||
|
}
|
||||||
|
|
||||||
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Info {
|
struct Info {
|
||||||
|
@ -321,6 +359,7 @@ async fn main() {
|
||||||
.route("/api/0/ean/:ean/history", get(get_product_history))
|
.route("/api/0/ean/:ean/history", get(get_product_history))
|
||||||
.route("/api/0/info", get(get_info))
|
.route("/api/0/info", get(get_info))
|
||||||
.route("/api/0/search/:query", get(search))
|
.route("/api/0/search/:query", get(search))
|
||||||
|
.route("/api/0/internal/latest-metadata", get(dump_latest_metadata))
|
||||||
.with_state(pool);
|
.with_state(pool);
|
||||||
|
|
||||||
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
|
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
|
||||||
|
|
|
@ -11,6 +11,8 @@ import {
|
||||||
index,
|
index,
|
||||||
pgMaterializedView,
|
pgMaterializedView,
|
||||||
pgView,
|
pgView,
|
||||||
|
timestamp,
|
||||||
|
boolean,
|
||||||
} from "drizzle-orm/pg-core";
|
} from "drizzle-orm/pg-core";
|
||||||
|
|
||||||
export const datasets = pgTable(
|
export const datasets = pgTable(
|
||||||
|
@ -211,3 +213,22 @@ export const productos_descripcion_index = pgTable(
|
||||||
).using("gin", sql`to_tsvector('spanish', ${table.productos_descripcion})`),
|
).using("gin", sql`to_tsvector('spanish', ${table.productos_descripcion})`),
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// See scripts/refresh-scrapped-metadata.ts, which is meant to populate this
// table from the old scraper's SQLite database. Holds the latest scraped
// metadata per product.
export const productos_metadata_scrapped = pgTable(
  "productos_metadata_scrapped",
  {
    // EAN barcode. Nullable in this WIP schema — TODO confirm whether it
    // should be .notNull() / serve as a key.
    ean: bigint("ean", { mode: "bigint" }),
    fetchedAt: timestamp("fetched_at").notNull(),
    // NOTE(review): `integer` is 32-bit in Postgres while the scraper stores
    // a 64-bit value — confirm centavo prices cannot overflow.
    precioCentavos: integer("precio_centavos"),
    inStock: boolean("in_stock"),
    url: text("url").notNull(),
    name: text("name"),
    imageUrl: text("image_url"),
  },
  (table) => ({
    // Lookup index for fetching a product's metadata by EAN.
    productos_metadata_scrapped_ean_idx: index(
      "productos_metadata_scrapped_ean_idx"
    ).on(table.ean),
  })
);
|
||||||
|
|
43
sepa/scripts/refresh-scrapped-metadata.ts
Normal file
43
sepa/scripts/refresh-scrapped-metadata.ts
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
/**
 * This script updates the "new" database from a database generated by the old
 * preciazo scraper, which scrapes the supermarket websites.
 *
 * Only the latest metadata of each product is kept.
 *
 * The scraper's SQLite database is passed as a parameter.
 *
 * NOTE(review): WIP — rows are currently only logged, not yet written into
 * `productos_metadata_scrapped`.
 */

import { drizzle } from "drizzle-orm/postgres-js";
import postgres from "postgres";
import * as schema from "../db/schema";
import { Database } from "bun:sqlite";

// Usage: the first CLI argument must be the path to the scraper's SQLite DB.
if (!process.argv[2]) {
  console.error("falta pasar la base de datos del scraper como parametro");
  process.exit(1);
}

// Destination Postgres client; connection parameters presumably come from the
// standard PG* environment variables — TODO confirm. Not used yet (WIP).
const db = drizzle(postgres(), {
  schema,
  logger: true,
});
// `using` disposes (closes) the SQLite handle when this scope exits.
using scraperDb = new Database(process.argv[2], {
  strict: true,
  readonly: true,
});

// Latest row per EAN (self-join on MAX(fetched_at)), skipping rows whose
// name was never scraped. Mirrors the query used by the API server.
const precios = scraperDb.query(`
SELECT p.id, p.ean, p.name, p.image_url, p.url, p.precio_centavos, p.in_stock, p.fetched_at
FROM precios p
INNER JOIN (
SELECT ean, MAX(fetched_at) as max_fetched_at
FROM precios
GROUP BY ean
) latest ON p.ean = latest.ean AND p.fetched_at = latest.max_fetched_at
WHERE p.name IS NOT NULL
`);

// @ts-expect-error bun 1.1.30 has outdated types, it's fixed in main branch
for (const row of precios.iterate()) {
  console.log(row);
}
|
Loading…
Reference in a new issue