diff --git a/db-datos/db.ts b/db-datos/db.js similarity index 68% rename from db-datos/db.ts rename to db-datos/db.js index 8781000..4ba189d 100644 --- a/db-datos/db.ts +++ b/db-datos/db.js @@ -1,5 +1,6 @@ -import { Database } from "bun:sqlite"; -import { drizzle } from "drizzle-orm/bun-sqlite"; +// @ts-check +import Database from "better-sqlite3"; +import { drizzle } from "drizzle-orm/better-sqlite3"; import { DB_PATH } from "./drizzle.config.js"; import { migrateDb } from "./migrate.js"; import * as schema from "./schema.js"; diff --git a/db-datos/drizzle.config.ts b/db-datos/drizzle.config.js similarity index 73% rename from db-datos/drizzle.config.ts rename to db-datos/drizzle.config.js index 058b309..3a95609 100644 --- a/db-datos/drizzle.config.ts +++ b/db-datos/drizzle.config.js @@ -1,11 +1,10 @@ -import type { Config } from "drizzle-kit"; - export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db"; +/** @type { import("drizzle-kit").Config } */ export default { schema: "./schema.ts", out: "./drizzle", dbCredentials: { url: DB_PATH, }, -} satisfies Config; +}; diff --git a/db-datos/migrate.ts b/db-datos/migrate.js similarity index 70% rename from db-datos/migrate.ts rename to db-datos/migrate.js index c45fb9b..7e0ccd5 100644 --- a/db-datos/migrate.ts +++ b/db-datos/migrate.js @@ -1,7 +1,8 @@ -import Database from "bun:sqlite"; +// @ts-check +import Database from "better-sqlite3"; import { join, dirname } from "node:path"; -import { drizzle } from "drizzle-orm/bun-sqlite"; -import { migrate } from "drizzle-orm/bun-sqlite/migrator"; +import { drizzle } from "drizzle-orm/better-sqlite3"; +import { migrate } from "drizzle-orm/better-sqlite3/migrator"; import * as schema from "./schema.js"; import { DB_PATH } from "./drizzle.config.js"; @@ -11,7 +12,7 @@ export function migrateDb() { const db = drizzle(sqlite, { schema }); migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") }); - sqlite.run(` + sqlite.exec(` pragma journal_mode = WAL; PRAGMA synchronous = NORMAL; `); diff --git a/db-datos/package.json b/db-datos/package.json index 5020c26..92949dc 100644 --- a/db-datos/package.json +++ b/db-datos/package.json @@ -5,15 +5,19 @@ "description": "", "main": "index.js", "scripts": { - "generate": "drizzle-kit generate:sqlite" + "generate": "drizzle-kit generate:sqlite", + "migrate": "node db.js" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { + "better-sqlite3": "^9.2.2", "drizzle-orm": "^0.29.1" }, "devDependencies": { + "@types/better-sqlite3": "^7.6.8", + "@types/node": "^20.10.6", "drizzle-kit": "^0.20.7" } } diff --git a/db-datos/schema.ts b/db-datos/schema.js similarity index 88% rename from db-datos/schema.ts rename to db-datos/schema.js index a45efda..2b921b9 100644 --- a/db-datos/schema.ts +++ b/db-datos/schema.js @@ -1,3 +1,4 @@ +// @ts-check import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core"; export const precios = sqliteTable( @@ -18,10 +19,10 @@ export const precios = sqliteTable( return { preciosEanIdx: index("precios_ean_idx").on(precios.ean), }; - } + }, ); -export type Precio = typeof precios.$inferSelect; +/** @typedef {typeof precios.$inferSelect} Precio */ export const productoUrls = sqliteTable("producto_urls", { id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), @@ -30,4 +31,4 @@ export const productoUrls = sqliteTable("producto_urls", { lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(), }); -export type ProductUrl = typeof productoUrls.$inferSelect; +/** @typedef {typeof productoUrls.$inferSelect} ProductUrl */ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 40bca24..9c2f61a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,10 +8,19 @@ importers: db-datos: dependencies: + better-sqlite3: + specifier: ^9.2.2 + version: 9.2.2 drizzle-orm: specifier: ^0.29.1 version: 0.29.3(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2) devDependencies: + '@types/better-sqlite3': + specifier: ^7.6.8 + version: 7.6.8 + '@types/node': + specifier: ^20.10.6 + version: 20.11.0 drizzle-kit: specifier: ^0.20.7 version: 0.20.12 diff --git a/readme.md b/readme.md index 847b3fb..7e4f7b8 100644 --- a/readme.md +++ b/readme.md @@ -2,31 +2,40 @@ scrapeo "masivo" de precios y datos en supermercados argentinos -## componentes (en orden de proceso) +## componentes -- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear +### scraper-rs - (no hace falta correrlos porque ya hay listas armadas en [data/](./data/)) +el [scraper](./scraper-rs/) busca links de productos a scrapear, descarga todos los links, extrae varios datos y los guarda en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)). -- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)) -- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos +(antes había un scraper escrito en JavaScript, pero por problemas de _reliability_ lo reescribí en Rust (?)) + +### sitio + +el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos. ## setup -hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js. +para el schema de la base de datos y el sitio, es necesario [Node.js](https://nodejs.org/) y [pnpm](https://pnpm.io/). para el scraper, es necesario [Rust](https://www.rust-lang.org/) estable. + +crea la base de datos: +``` +cd db-datos/ +pnpm install +pnpm migrate +``` después, escrapea un sample de productos de Carrefour a una BD: ``` -cd scraper/ -bun install -bun cli.ts scrap ./data/samples/Carrefour.50.txt +cd ../scraper-rs/ +cargo run -- fetch-list ../data/samples/Carrefour.50.txt ``` ahora miralo en el sitio: ``` -cd sitio/ -bun install -bun dev +cd ../sitio/ +pnpm install +pnpm dev ```