mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
Compare commits
8 commits
eb2b68fab0
...
290d29ea78
Author | SHA1 | Date | |
---|---|---|---|
290d29ea78 | |||
d58df3fd04 | |||
dbbd8e7f3a | |||
7ccc2432e3 | |||
28579d6883 | |||
378d4a4313 | |||
7ae225b1d6 | |||
4f1ea65de0 |
14 changed files with 149 additions and 58 deletions
|
@ -1,6 +1,7 @@
|
||||||
data/warcs/
|
data/warcs/
|
||||||
data/carrefour/
|
data/carrefour/
|
||||||
*/*.db*
|
*/*.db*
|
||||||
|
sqlite.db
|
||||||
downloader/
|
downloader/
|
||||||
node_modules/
|
node_modules/
|
||||||
*/node_modules/
|
*/node_modules/
|
||||||
|
|
|
@ -4,6 +4,10 @@ WORKDIR /usr/src/app
|
||||||
FROM base as build
|
FROM base as build
|
||||||
RUN apk add --no-cache nodejs npm
|
RUN apk add --no-cache nodejs npm
|
||||||
RUN npm install --global pnpm
|
RUN npm install --global pnpm
|
||||||
|
COPY db-datos/package.json db-datos/package.json
|
||||||
|
COPY sitio/package.json sitio/package.json
|
||||||
|
COPY pnpm-lock.yaml pnpm-workspace.yaml .
|
||||||
|
RUN cd sitio && pnpm install
|
||||||
COPY . .
|
COPY . .
|
||||||
COPY db-datos/drizzle .
|
COPY db-datos/drizzle .
|
||||||
RUN cd sitio && \
|
RUN cd sitio && \
|
||||||
|
|
|
@ -3,6 +3,13 @@ import Database from "better-sqlite3";
|
||||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||||
import { DB_PATH } from "./drizzle.config.js";
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
||||||
|
import { migrateDb } from "./migrate.js";
|
||||||
|
|
||||||
export const sqlite = new Database(DB_PATH);
|
/** @type {null | import("drizzle-orm/better-sqlite3").BetterSQLite3Database<schema>} */
|
||||||
export const db = drizzle(sqlite, { schema });
|
let db = null;
|
||||||
|
export function getDb() {
|
||||||
|
const sqlite = new Database(DB_PATH);
|
||||||
|
db = drizzle(sqlite, { schema });
|
||||||
|
migrateDb(db);
|
||||||
|
return db;
|
||||||
|
}
|
||||||
|
|
|
@ -1,21 +1,13 @@
|
||||||
// @ts-check
|
// @ts-check
|
||||||
import Database from "better-sqlite3";
|
|
||||||
import { join, dirname } from "node:path";
|
|
||||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
|
||||||
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
|
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
||||||
import { DB_PATH } from "./drizzle.config.js";
|
import { sql } from "drizzle-orm";
|
||||||
|
|
||||||
const url = new URL(import.meta.url);
|
/**
|
||||||
export function migrateDb() {
|
* @param {import("drizzle-orm/better-sqlite3").BetterSQLite3Database<schema>} db
|
||||||
const sqlite = new Database(DB_PATH);
|
*/
|
||||||
const db = drizzle(sqlite, { schema });
|
export function migrateDb(db) {
|
||||||
|
migrate(db, { migrationsFolder: "node_modules/db-datos/drizzle" });
|
||||||
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
|
db.run(sql`pragma journal_mode = WAL;`);
|
||||||
sqlite.exec(`
|
db.run(sql`PRAGMA synchronous = NORMAL;`);
|
||||||
pragma journal_mode = WAL;
|
|
||||||
PRAGMA synchronous = NORMAL;
|
|
||||||
`);
|
|
||||||
|
|
||||||
sqlite.close();
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
import { sql } from "drizzle-orm";
|
|
||||||
import { db } from "./db.js";
|
|
||||||
import { productoUrls } from "./schema.js";
|
|
||||||
|
|
||||||
export function saveUrls(urls: string[]) {
|
|
||||||
db.transaction((tx) => {
|
|
||||||
const now = new Date();
|
|
||||||
const insertUrlTra = tx
|
|
||||||
.insert(productoUrls)
|
|
||||||
.values({
|
|
||||||
url: sql.placeholder("url"),
|
|
||||||
firstSeen: now,
|
|
||||||
lastSeen: now,
|
|
||||||
})
|
|
||||||
.onConflictDoUpdate({
|
|
||||||
target: productoUrls.url,
|
|
||||||
set: { lastSeen: now },
|
|
||||||
})
|
|
||||||
.prepare();
|
|
||||||
|
|
||||||
for (const href of urls) {
|
|
||||||
insertUrlTra.run({ url: href });
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
99
scraper-rs/Cargo.lock
generated
99
scraper-rs/Cargo.lock
generated
|
@ -61,6 +61,21 @@ version = "0.2.16"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "android-tzdata"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "android_system_properties"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "0.6.5"
|
version = "0.6.5"
|
||||||
|
@ -227,6 +242,20 @@ version = "1.0.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chrono"
|
||||||
|
version = "0.4.32"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a"
|
||||||
|
dependencies = [
|
||||||
|
"android-tzdata",
|
||||||
|
"iana-time-zone",
|
||||||
|
"js-sys",
|
||||||
|
"num-traits",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-targets 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "4.4.15"
|
version = "4.4.15"
|
||||||
|
@ -298,6 +327,17 @@ dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cron"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1ff76b51e4c068c52bfd2866e1567bee7c567ae8f24ada09fd4307019e25eab7"
|
||||||
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
|
"nom",
|
||||||
|
"once_cell",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deadpool"
|
name = "deadpool"
|
||||||
version = "0.10.0"
|
version = "0.10.0"
|
||||||
|
@ -636,6 +676,29 @@ dependencies = [
|
||||||
"tokio-rustls",
|
"tokio-rustls",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone"
|
||||||
|
version = "0.1.59"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
|
||||||
|
dependencies = [
|
||||||
|
"android_system_properties",
|
||||||
|
"core-foundation-sys",
|
||||||
|
"iana-time-zone-haiku",
|
||||||
|
"js-sys",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone-haiku"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
|
@ -745,6 +808,12 @@ version = "0.3.17"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "minimal-lexical"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "miniz_oxide"
|
name = "miniz_oxide"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
|
@ -774,6 +843,16 @@ dependencies = [
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nom"
|
||||||
|
version = "7.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
"minimal-lexical",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nu-ansi-term"
|
name = "nu-ansi-term"
|
||||||
version = "0.46.0"
|
version = "0.46.0"
|
||||||
|
@ -784,6 +863,15 @@ dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num-traits"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num_cpus"
|
name = "num_cpus"
|
||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
|
@ -1135,7 +1223,9 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"again",
|
"again",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
|
"cron",
|
||||||
"deadpool",
|
"deadpool",
|
||||||
"deadpool-sqlite",
|
"deadpool-sqlite",
|
||||||
"futures",
|
"futures",
|
||||||
|
@ -1688,6 +1778,15 @@ version = "0.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-core"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.48.0"
|
version = "0.48.0"
|
||||||
|
|
|
@ -8,7 +8,9 @@ edition = "2021"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
again = "0.1.2"
|
again = "0.1.2"
|
||||||
anyhow = "1.0.79"
|
anyhow = "1.0.79"
|
||||||
|
chrono = "0.4.32"
|
||||||
clap = { version = "4.4.15", features = ["derive"] }
|
clap = { version = "4.4.15", features = ["derive"] }
|
||||||
|
cron = "0.12.0"
|
||||||
deadpool = "0.10.0"
|
deadpool = "0.10.0"
|
||||||
deadpool-sqlite = "0.7.0"
|
deadpool-sqlite = "0.7.0"
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
use again::RetryPolicy;
|
use again::RetryPolicy;
|
||||||
use clap::{Parser, ValueEnum};
|
use clap::{Parser, ValueEnum};
|
||||||
|
use cron::Schedule;
|
||||||
use deadpool_sqlite::Pool;
|
use deadpool_sqlite::Pool;
|
||||||
use futures::{future, stream, StreamExt};
|
use futures::{future, stream, StreamExt};
|
||||||
use nanoid::nanoid;
|
use nanoid::nanoid;
|
||||||
|
@ -9,10 +10,10 @@ use std::{
|
||||||
env::{self},
|
env::{self},
|
||||||
fs,
|
fs,
|
||||||
path::PathBuf,
|
path::PathBuf,
|
||||||
time::Duration,
|
str::FromStr,
|
||||||
|
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::time;
|
|
||||||
|
|
||||||
#[derive(ValueEnum, Clone, Debug)]
|
#[derive(ValueEnum, Clone, Debug)]
|
||||||
enum Supermercado {
|
enum Supermercado {
|
||||||
|
@ -399,16 +400,24 @@ async fn auto_cli() -> anyhow::Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
async fn cron_cli() -> anyhow::Result<()> {
|
async fn cron_cli() -> anyhow::Result<()> {
|
||||||
let mut interval = time::interval(std::time::Duration::from_secs(60 * 60 * 24));
|
// https://crontab.guru
|
||||||
|
let schedule = Schedule::from_str("0 0 2 * * * *").unwrap();
|
||||||
|
// let schedule = Schedule::from_str("0 26 21 * * * *").unwrap();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
interval.tick().await;
|
let t = schedule
|
||||||
tokio::spawn(auto_cli());
|
.upcoming(chrono::Utc)
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.signed_duration_since(chrono::Utc::now())
|
||||||
|
.to_std()
|
||||||
|
.unwrap();
|
||||||
|
println!("Waiting for {:?}", t);
|
||||||
|
tokio::time::sleep(t).await;
|
||||||
|
auto_cli().await.unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
use std::time::{SystemTime, UNIX_EPOCH};
|
|
||||||
|
|
||||||
mod sites;
|
mod sites;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
|
|
@ -1,4 +1,2 @@
|
||||||
export { db } from "db-datos/db.js";
|
export { getDb } from "db-datos/db.js";
|
||||||
export * as schema from "db-datos/schema.js";
|
export * as schema from "db-datos/schema.js";
|
||||||
import { migrateDb } from "db-datos/migrate.js";
|
|
||||||
migrateDb();
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import { countDistinct } from "drizzle-orm";
|
import { countDistinct } from "drizzle-orm";
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { getDb, schema } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
const { precios } = schema;
|
||||||
|
|
||||||
export const load: PageServerLoad = async () => {
|
export const load: PageServerLoad = async () => {
|
||||||
|
const db = await getDb();
|
||||||
const nProductosR = await db
|
const nProductosR = await db
|
||||||
.select({
|
.select({
|
||||||
count: countDistinct(precios.ean),
|
count: countDistinct(precios.ean),
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
import type { PageData, PageServerLoad } from "./$types";
|
import type { PageData, PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { getDb, schema } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
const { precios } = schema;
|
||||||
import { sql } from "drizzle-orm";
|
import { sql } from "drizzle-orm";
|
||||||
|
|
||||||
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery();
|
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery();
|
||||||
|
|
||||||
async function doQuery() {
|
async function doQuery() {
|
||||||
|
const db = await getDb();
|
||||||
const q = db
|
const q = db
|
||||||
.select({
|
.select({
|
||||||
ean: precios.ean,
|
ean: precios.ean,
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import { error } from "@sveltejs/kit";
|
import { error } from "@sveltejs/kit";
|
||||||
import { eq, max } from "drizzle-orm";
|
import { eq, max } from "drizzle-orm";
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db, schema } from "$lib/server/db";
|
import { getDb, schema } from "$lib/server/db";
|
||||||
const { precios } = schema;
|
const { precios } = schema;
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({ params }) => {
|
export const load: PageServerLoad = async ({ params }) => {
|
||||||
|
const db = await getDb();
|
||||||
const q = db
|
const q = db
|
||||||
.select()
|
.select()
|
||||||
.from(precios)
|
.from(precios)
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import { error } from "@sveltejs/kit";
|
import { error } from "@sveltejs/kit";
|
||||||
import { sql } from "drizzle-orm";
|
import { sql } from "drizzle-orm";
|
||||||
import type { PageServerLoad } from "./$types";
|
import type { PageServerLoad } from "./$types";
|
||||||
import { db } from "$lib/server/db";
|
import { getDb } from "$lib/server/db";
|
||||||
|
|
||||||
export const load: PageServerLoad = async ({ url }) => {
|
export const load: PageServerLoad = async ({ url }) => {
|
||||||
|
const db = await getDb();
|
||||||
const query = url.searchParams.get("q");
|
const query = url.searchParams.get("q");
|
||||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||||
if (query) {
|
if (query) {
|
||||||
|
|
|
@ -17,5 +17,5 @@
|
||||||
"forceConsistentCasingInFileNames": true
|
"forceConsistentCasingInFileNames": true
|
||||||
},
|
},
|
||||||
"include": ["**/*.ts", "**/*.js"],
|
"include": ["**/*.ts", "**/*.js"],
|
||||||
"exclude": ["sitio/build"]
|
"exclude": ["./scraper-rs", "data"]
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue