Compare commits

...

8 commits

Author SHA1 Message Date
290d29ea78 dockerfile: oops 2024-01-24 21:17:34 -03:00
d58df3fd04 a verrrr 2024-01-24 20:13:20 -03:00
dbbd8e7f3a dockerfile: cachear deps 2024-01-24 18:49:43 -03:00
7ccc2432e3 ignorar db en dockerignore 2024-01-24 18:49:26 -03:00
28579d6883 tsconfig: arreglar diagnostics 2024-01-24 18:49:17 -03:00
378d4a4313 usar un cron posta 2024-01-24 18:26:40 -03:00
7ae225b1d6 sitio: migrar al iniciar 2024-01-24 17:57:59 -03:00
4f1ea65de0 xd 2024-01-24 17:31:48 -03:00
14 changed files with 149 additions and 58 deletions

View file

@ -1,6 +1,7 @@
data/warcs/ data/warcs/
data/carrefour/ data/carrefour/
*/*.db* */*.db*
sqlite.db
downloader/ downloader/
node_modules/ node_modules/
*/node_modules/ */node_modules/

View file

@ -4,6 +4,10 @@ WORKDIR /usr/src/app
FROM base as build FROM base as build
RUN apk add --no-cache nodejs npm RUN apk add --no-cache nodejs npm
RUN npm install --global pnpm RUN npm install --global pnpm
COPY db-datos/package.json db-datos/package.json
COPY sitio/package.json sitio/package.json
COPY pnpm-lock.yaml pnpm-workspace.yaml .
RUN cd sitio && pnpm install
COPY . . COPY . .
COPY db-datos/drizzle . COPY db-datos/drizzle .
RUN cd sitio && \ RUN cd sitio && \

View file

@ -3,6 +3,13 @@ import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { drizzle } from "drizzle-orm/better-sqlite3";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { migrateDb } from "./migrate.js";
export const sqlite = new Database(DB_PATH); /** @type {null | import("drizzle-orm/better-sqlite3").BetterSQLite3Database<schema>} */
export const db = drizzle(sqlite, { schema }); let db = null;
export function getDb() {
const sqlite = new Database(DB_PATH);
db = drizzle(sqlite, { schema });
migrateDb(db);
return db;
}

View file

@ -1,21 +1,13 @@
// @ts-check // @ts-check
import Database from "better-sqlite3";
import { join, dirname } from "node:path";
import { drizzle } from "drizzle-orm/better-sqlite3";
import { migrate } from "drizzle-orm/better-sqlite3/migrator"; import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js"; import { sql } from "drizzle-orm";
const url = new URL(import.meta.url); /**
export function migrateDb() { * @param {import("drizzle-orm/better-sqlite3").BetterSQLite3Database<schema>} db
const sqlite = new Database(DB_PATH); */
const db = drizzle(sqlite, { schema }); export function migrateDb(db) {
migrate(db, { migrationsFolder: "node_modules/db-datos/drizzle" });
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") }); db.run(sql`pragma journal_mode = WAL;`);
sqlite.exec(` db.run(sql`PRAGMA synchronous = NORMAL;`);
pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL;
`);
sqlite.close();
} }

View file

@ -1,25 +0,0 @@
import { sql } from "drizzle-orm";
import { db } from "./db.js";
import { productoUrls } from "./schema.js";
/**
 * Records every URL in `urls` in the `productoUrls` table within one
 * `db.transaction` call.
 *
 * A single timestamp is taken per call. It is written to both `firstSeen`
 * and `lastSeen` on insert; when the insert conflicts on `productoUrls.url`,
 * only `lastSeen` is overwritten with that timestamp.
 *
 * @param urls - URLs to upsert, processed in array order.
 */
export function saveUrls(urls: string[]) {
  db.transaction((tx) => {
    const seenAt = new Date();
    // Built once with a named "url" placeholder, then executed per URL below.
    const upsertUrl = tx
      .insert(productoUrls)
      .values({
        url: sql.placeholder("url"),
        firstSeen: seenAt,
        lastSeen: seenAt,
      })
      .onConflictDoUpdate({
        target: productoUrls.url,
        set: { lastSeen: seenAt },
      })
      .prepare();
    urls.forEach((url) => {
      upsertUrl.run({ url });
    });
  });
}

99
scraper-rs/Cargo.lock generated
View file

@ -61,6 +61,21 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "anstream" name = "anstream"
version = "0.6.5" version = "0.6.5"
@ -227,6 +242,20 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-targets 0.52.0",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.4.15" version = "4.4.15"
@ -298,6 +327,17 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "cron"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ff76b51e4c068c52bfd2866e1567bee7c567ae8f24ada09fd4307019e25eab7"
dependencies = [
"chrono",
"nom",
"once_cell",
]
[[package]] [[package]]
name = "deadpool" name = "deadpool"
version = "0.10.0" version = "0.10.0"
@ -636,6 +676,29 @@ dependencies = [
"tokio-rustls", "tokio-rustls",
] ]
[[package]]
name = "iana-time-zone"
version = "0.1.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "0.5.0" version = "0.5.0"
@ -745,6 +808,12 @@ version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]] [[package]]
name = "miniz_oxide" name = "miniz_oxide"
version = "0.7.1" version = "0.7.1"
@ -774,6 +843,16 @@ dependencies = [
"rand 0.8.5", "rand 0.8.5",
] ]
[[package]]
name = "nom"
version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
dependencies = [
"memchr",
"minimal-lexical",
]
[[package]] [[package]]
name = "nu-ansi-term" name = "nu-ansi-term"
version = "0.46.0" version = "0.46.0"
@ -784,6 +863,15 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
[[package]] [[package]]
name = "num_cpus" name = "num_cpus"
version = "1.16.0" version = "1.16.0"
@ -1135,7 +1223,9 @@ version = "0.1.0"
dependencies = [ dependencies = [
"again", "again",
"anyhow", "anyhow",
"chrono",
"clap", "clap",
"cron",
"deadpool", "deadpool",
"deadpool-sqlite", "deadpool-sqlite",
"futures", "futures",
@ -1688,6 +1778,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.0",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.48.0" version = "0.48.0"

View file

@ -8,7 +8,9 @@ edition = "2021"
[dependencies] [dependencies]
again = "0.1.2" again = "0.1.2"
anyhow = "1.0.79" anyhow = "1.0.79"
chrono = "0.4.32"
clap = { version = "4.4.15", features = ["derive"] } clap = { version = "4.4.15", features = ["derive"] }
cron = "0.12.0"
deadpool = "0.10.0" deadpool = "0.10.0"
deadpool-sqlite = "0.7.0" deadpool-sqlite = "0.7.0"
futures = "0.3.30" futures = "0.3.30"

View file

@ -1,5 +1,6 @@
use again::RetryPolicy; use again::RetryPolicy;
use clap::{Parser, ValueEnum}; use clap::{Parser, ValueEnum};
use cron::Schedule;
use deadpool_sqlite::Pool; use deadpool_sqlite::Pool;
use futures::{future, stream, StreamExt}; use futures::{future, stream, StreamExt};
use nanoid::nanoid; use nanoid::nanoid;
@ -9,10 +10,10 @@ use std::{
env::{self}, env::{self},
fs, fs,
path::PathBuf, path::PathBuf,
time::Duration, str::FromStr,
time::{Duration, SystemTime, UNIX_EPOCH},
}; };
use thiserror::Error; use thiserror::Error;
use tokio::time;
#[derive(ValueEnum, Clone, Debug)] #[derive(ValueEnum, Clone, Debug)]
enum Supermercado { enum Supermercado {
@ -399,16 +400,24 @@ async fn auto_cli() -> anyhow::Result<()> {
Ok(()) Ok(())
} }
async fn cron_cli() -> anyhow::Result<()> { async fn cron_cli() -> anyhow::Result<()> {
let mut interval = time::interval(std::time::Duration::from_secs(60 * 60 * 24)); // https://crontab.guru
let schedule = Schedule::from_str("0 0 2 * * * *").unwrap();
// let schedule = Schedule::from_str("0 26 21 * * * *").unwrap();
loop { loop {
interval.tick().await; let t = schedule
tokio::spawn(auto_cli()); .upcoming(chrono::Utc)
.next()
.unwrap()
.signed_duration_since(chrono::Utc::now())
.to_std()
.unwrap();
println!("Waiting for {:?}", t);
tokio::time::sleep(t).await;
auto_cli().await.unwrap();
} }
} }
use std::time::{SystemTime, UNIX_EPOCH};
mod sites; mod sites;
#[derive(Debug)] #[derive(Debug)]

View file

@ -1,4 +1,2 @@
export { db } from "db-datos/db.js"; export { getDb } from "db-datos/db.js";
export * as schema from "db-datos/schema.js"; export * as schema from "db-datos/schema.js";
import { migrateDb } from "db-datos/migrate.js";
migrateDb();

View file

@ -1,9 +1,10 @@
import { countDistinct } from "drizzle-orm"; import { countDistinct } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { getDb, schema } from "$lib/server/db";
const { precios } = schema; const { precios } = schema;
export const load: PageServerLoad = async () => { export const load: PageServerLoad = async () => {
const db = await getDb();
const nProductosR = await db const nProductosR = await db
.select({ .select({
count: countDistinct(precios.ean), count: countDistinct(precios.ean),

View file

@ -1,11 +1,12 @@
import type { PageData, PageServerLoad } from "./$types"; import type { PageData, PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { getDb, schema } from "$lib/server/db";
const { precios } = schema; const { precios } = schema;
import { sql } from "drizzle-orm"; import { sql } from "drizzle-orm";
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery(); let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery();
async function doQuery() { async function doQuery() {
const db = await getDb();
const q = db const q = db
.select({ .select({
ean: precios.ean, ean: precios.ean,

View file

@ -1,10 +1,11 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq, max } from "drizzle-orm"; import { eq, max } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { getDb, schema } from "$lib/server/db";
const { precios } = schema; const { precios } = schema;
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const db = await getDb();
const q = db const q = db
.select() .select()
.from(precios) .from(precios)

View file

@ -1,9 +1,10 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { sql } from "drizzle-orm"; import { sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db } from "$lib/server/db"; import { getDb } from "$lib/server/db";
export const load: PageServerLoad = async ({ url }) => { export const load: PageServerLoad = async ({ url }) => {
const db = await getDb();
const query = url.searchParams.get("q"); const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; imageUrl: string }[] = null; let results: null | { ean: string; name: string; imageUrl: string }[] = null;
if (query) { if (query) {

View file

@ -17,5 +17,5 @@
"forceConsistentCasingInFileNames": true "forceConsistentCasingInFileNames": true
}, },
"include": ["**/*.ts", "**/*.js"], "include": ["**/*.ts", "**/*.js"],
"exclude": ["sitio/build"] "exclude": ["./scraper-rs", "data"]
} }