Compare commits

...

4 commits

Author SHA1 Message Date
958daf0abd ci: pnpm 2024-01-13 22:09:24 -03:00
68bea964be db-datos sin typescript, actualizar readme 2024-01-13 22:07:23 -03:00
260aab1ea2 bun->pnpm 2024-01-13 21:56:41 -03:00
06c0e5814e WIP: parar de usar Bun 2024-01-13 21:44:33 -03:00
13 changed files with 4194 additions and 47 deletions

View file

@ -14,16 +14,19 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- uses: oven-sh/setup-bun@v1 - uses: pnpm/action-setup@v2
with:
version: 8
- name: Use Node.js 20
uses: actions/setup-node@v3
with:
node-version: 20
cache: 'pnpm'
- name: Install dependencies
run: pnpm install
- run: bun install - run: pnpm check
working-directory: ./sitio working-directory: ./sitio
- run: bun check
working-directory: ./sitio
- run: bun install
working-directory: ./scraper
- run: bun check
working-directory: ./scraper
build-and-push-sitio: build-and-push-sitio:
needs: check needs: check

BIN
bun.lockb

Binary file not shown.

View file

@ -1,5 +1,6 @@
import { Database } from "bun:sqlite"; // @ts-check
import { drizzle } from "drizzle-orm/bun-sqlite"; import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
import { migrateDb } from "./migrate.js"; import { migrateDb } from "./migrate.js";
import * as schema from "./schema.js"; import * as schema from "./schema.js";

View file

@ -1,11 +1,10 @@
import type { Config } from "drizzle-kit";
export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db"; export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
/** @type { import("drizzle-kit").Config } */
export default { export default {
schema: "./schema.ts", schema: "./schema.ts",
out: "./drizzle", out: "./drizzle",
dbCredentials: { dbCredentials: {
url: DB_PATH, url: DB_PATH,
}, },
} satisfies Config; };

View file

@ -1,7 +1,8 @@
import Database from "bun:sqlite"; // @ts-check
import Database from "better-sqlite3";
import { join, dirname } from "node:path"; import { join, dirname } from "node:path";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/better-sqlite3";
import { migrate } from "drizzle-orm/bun-sqlite/migrator"; import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
@ -11,7 +12,7 @@ export function migrateDb() {
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") }); migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
sqlite.run(` sqlite.exec(`
pragma journal_mode = WAL; pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL; PRAGMA synchronous = NORMAL;
`); `);

View file

@ -5,16 +5,19 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"generate": "drizzle-kit generate:sqlite" "generate": "drizzle-kit generate:sqlite",
"migrate": "node db.js"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"better-sqlite3": "^9.2.2",
"drizzle-orm": "^0.29.1" "drizzle-orm": "^0.29.1"
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "^1.0.0", "@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6",
"drizzle-kit": "^0.20.7" "drizzle-kit": "^0.20.7"
} }
} }

View file

@ -1,3 +1,4 @@
// @ts-check
import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core"; import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
export const precios = sqliteTable( export const precios = sqliteTable(
@ -18,10 +19,10 @@ export const precios = sqliteTable(
return { return {
preciosEanIdx: index("precios_ean_idx").on(precios.ean), preciosEanIdx: index("precios_ean_idx").on(precios.ean),
}; };
} },
); );
export type Precio = typeof precios.$inferSelect; /** @typedef {typeof precios.$inferSelect} Precio */
export const productoUrls = sqliteTable("producto_urls", { export const productoUrls = sqliteTable("producto_urls", {
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
@ -30,4 +31,4 @@ export const productoUrls = sqliteTable("producto_urls", {
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(), lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
}); });
export type ProductUrl = typeof productoUrls.$inferSelect; /** @typedef {typeof productoUrls.$inferSelect} ProductUrl */

View file

@ -1,10 +0,0 @@
{
"name": "preciazo",
"private": true,
"workspaces": [
"link-scrapers",
"scraper",
"sitio",
"db-datos"
]
}

4136
pnpm-lock.yaml Normal file

File diff suppressed because it is too large Load diff

5
pnpm-workspace.yaml Normal file
View file

@ -0,0 +1,5 @@
packages:
- link-scrapers
- scraper
- sitio
- db-datos

View file

@ -2,31 +2,40 @@
scrapeo "masivo" de precios y datos en supermercados argentinos scrapeo "masivo" de precios y datos en supermercados argentinos
## componentes (en orden de proceso) ## componentes
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear ### scraper-rs
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/)) el [scraper](./scraper-rs/) busca links de productos a scrapear, descarga todos los links, extrae varios datos y los guarda en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)).
- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardándolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)) (antes había un scraper escrito en JavaScript, pero por problemas de _reliability_ lo reescribí en Rust (?))
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
### sitio
el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos.
## setup ## setup
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corriera más rápido. quizás en el futuro lo reemplace con good old Node.js. para el schema de la base de datos y el sitio, son necesarios [Node.js](https://nodejs.org/) y [pnpm](https://pnpm.io/). para el scraper, es necesario [Rust](https://www.rust-lang.org/) estable.
crea la base de datos:
```
cd db-datos/
pnpm install
pnpm migrate
```
después, scrapea un sample de productos de Carrefour a una BD: después, scrapea un sample de productos de Carrefour a una BD:
``` ```
cd scraper/ cd ../scraper-rs/
bun install cargo run -- fetch-list ../data/samples/Carrefour.50.txt
bun cli.ts scrap ./data/samples/Carrefour.50.txt
``` ```
ahora miralo en el sitio: ahora miralo en el sitio:
``` ```
cd sitio/ cd ../sitio/
bun install pnpm install
bun dev pnpm dev
``` ```

View file

@ -24,7 +24,6 @@
"zod": "^3.22.4" "zod": "^3.22.4"
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "^1.0.0",
"typescript": "^5.3.3" "typescript": "^5.3.3"
} }
} }

View file

@ -5,8 +5,8 @@
"scripts": { "scripts": {
"dev": "vite dev", "dev": "vite dev",
"build": "vite build", "build": "vite build",
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Containerfile ..", "build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Dockerfile ..",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio", "push:container": "pnpm build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
"preview": "vite preview", "preview": "vite preview",
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
"check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch", "check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch",