mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
4 commits
5e738985f6
...
958daf0abd
Author | SHA1 | Date | |
---|---|---|---|
958daf0abd | |||
68bea964be | |||
260aab1ea2 | |||
06c0e5814e |
13 changed files with 4194 additions and 47 deletions
19
.github/workflows/container.yml
vendored
19
.github/workflows/container.yml
vendored
|
@ -14,16 +14,19 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- uses: oven-sh/setup-bun@v1
|
- uses: pnpm/action-setup@v2
|
||||||
|
with:
|
||||||
|
version: 8
|
||||||
|
- name: Use Node.js 20
|
||||||
|
uses: actions/setup-node@v3
|
||||||
|
with:
|
||||||
|
node-version: 20
|
||||||
|
cache: 'pnpm'
|
||||||
|
- name: Install dependencies
|
||||||
|
run: pnpm install
|
||||||
|
|
||||||
- run: bun install
|
- run: pnpm check
|
||||||
working-directory: ./sitio
|
working-directory: ./sitio
|
||||||
- run: bun check
|
|
||||||
working-directory: ./sitio
|
|
||||||
- run: bun install
|
|
||||||
working-directory: ./scraper
|
|
||||||
- run: bun check
|
|
||||||
working-directory: ./scraper
|
|
||||||
|
|
||||||
build-and-push-sitio:
|
build-and-push-sitio:
|
||||||
needs: check
|
needs: check
|
||||||
|
|
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,5 +1,6 @@
|
||||||
import { Database } from "bun:sqlite";
|
// @ts-check
|
||||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
import Database from "better-sqlite3";
|
||||||
|
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||||
import { DB_PATH } from "./drizzle.config.js";
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
import { migrateDb } from "./migrate.js";
|
import { migrateDb } from "./migrate.js";
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
|
@ -1,11 +1,10 @@
|
||||||
import type { Config } from "drizzle-kit";
|
|
||||||
|
|
||||||
export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
|
export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
|
||||||
|
|
||||||
|
/** @type { import("drizzle-kit").Config } */
|
||||||
export default {
|
export default {
|
||||||
schema: "./schema.ts",
|
schema: "./schema.ts",
|
||||||
out: "./drizzle",
|
out: "./drizzle",
|
||||||
dbCredentials: {
|
dbCredentials: {
|
||||||
url: DB_PATH,
|
url: DB_PATH,
|
||||||
},
|
},
|
||||||
} satisfies Config;
|
};
|
|
@ -1,7 +1,8 @@
|
||||||
import Database from "bun:sqlite";
|
// @ts-check
|
||||||
|
import Database from "better-sqlite3";
|
||||||
import { join, dirname } from "node:path";
|
import { join, dirname } from "node:path";
|
||||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
||||||
import { DB_PATH } from "./drizzle.config.js";
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
|
|
||||||
|
@ -11,7 +12,7 @@ export function migrateDb() {
|
||||||
const db = drizzle(sqlite, { schema });
|
const db = drizzle(sqlite, { schema });
|
||||||
|
|
||||||
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
|
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
|
||||||
sqlite.run(`
|
sqlite.exec(`
|
||||||
pragma journal_mode = WAL;
|
pragma journal_mode = WAL;
|
||||||
PRAGMA synchronous = NORMAL;
|
PRAGMA synchronous = NORMAL;
|
||||||
`);
|
`);
|
|
@ -5,16 +5,19 @@
|
||||||
"description": "",
|
"description": "",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"generate": "drizzle-kit generate:sqlite"
|
"generate": "drizzle-kit generate:sqlite",
|
||||||
|
"migrate": "node db.js"
|
||||||
},
|
},
|
||||||
"keywords": [],
|
"keywords": [],
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"better-sqlite3": "^9.2.2",
|
||||||
"drizzle-orm": "^0.29.1"
|
"drizzle-orm": "^0.29.1"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/bun": "^1.0.0",
|
"@types/better-sqlite3": "^7.6.8",
|
||||||
|
"@types/node": "^20.10.6",
|
||||||
"drizzle-kit": "^0.20.7"
|
"drizzle-kit": "^0.20.7"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
// @ts-check
|
||||||
import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
||||||
|
|
||||||
export const precios = sqliteTable(
|
export const precios = sqliteTable(
|
||||||
|
@ -18,10 +19,10 @@ export const precios = sqliteTable(
|
||||||
return {
|
return {
|
||||||
preciosEanIdx: index("precios_ean_idx").on(precios.ean),
|
preciosEanIdx: index("precios_ean_idx").on(precios.ean),
|
||||||
};
|
};
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
export type Precio = typeof precios.$inferSelect;
|
/** @typedef {typeof precios.$inferSelect} Precio */
|
||||||
|
|
||||||
export const productoUrls = sqliteTable("producto_urls", {
|
export const productoUrls = sqliteTable("producto_urls", {
|
||||||
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
||||||
|
@ -30,4 +31,4 @@ export const productoUrls = sqliteTable("producto_urls", {
|
||||||
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
|
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export type ProductUrl = typeof productoUrls.$inferSelect;
|
/** @typedef {typeof productoUrls.$inferSelect} ProductUrl */
|
10
package.json
10
package.json
|
@ -1,10 +0,0 @@
|
||||||
{
|
|
||||||
"name": "preciazo",
|
|
||||||
"private": true,
|
|
||||||
"workspaces": [
|
|
||||||
"link-scrapers",
|
|
||||||
"scraper",
|
|
||||||
"sitio",
|
|
||||||
"db-datos"
|
|
||||||
]
|
|
||||||
}
|
|
4136
pnpm-lock.yaml
Normal file
4136
pnpm-lock.yaml
Normal file
File diff suppressed because it is too large
Load diff
5
pnpm-workspace.yaml
Normal file
5
pnpm-workspace.yaml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
packages:
|
||||||
|
- link-scrapers
|
||||||
|
- scraper
|
||||||
|
- sitio
|
||||||
|
- db-datos
|
33
readme.md
33
readme.md
|
@ -2,31 +2,40 @@
|
||||||
|
|
||||||
scrapeo "masivo" de precios y datos en supermercados argentinos
|
scrapeo "masivo" de precios y datos en supermercados argentinos
|
||||||
|
|
||||||
## componentes (en orden de proceso)
|
## componentes
|
||||||
|
|
||||||
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
|
### scraper-rs
|
||||||
|
|
||||||
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
el [scraper](./scraper-rs/) busca links de productos a scrapear, descarga todos los links, extrae varios datos y los guarda en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts)).
|
||||||
|
|
||||||
- el [scraper](./scraper/) descarga todos los links, extrayendo varios datos y guardandolos en una base de datos SQLite (definida en [db-datos](./db-datos/schema.ts))
|
(antes había un scraper escrito en JavaScript, pero por problemas de _reliability_ lo reescribí en Rust (?))
|
||||||
- el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos
|
|
||||||
|
### sitio
|
||||||
|
|
||||||
|
el [sitio](./sitio/) renderiza páginas a partir de la base de datos y hace gráficos lindos.
|
||||||
|
|
||||||
## setup
|
## setup
|
||||||
|
|
||||||
hay que instalar [Bun](https://bun.sh/), que lo estoy usando porque hacía que el scraper corra más rápido. quizás en el futuro lo reemplace con good old Node.js.
|
para el schema de la base de datos y el sitio, es necesario [Node.js](https://nodejs.org/) y [pnpm](https://pnpm.io/). para el scraper, es necesario [Rust](https://www.rust-lang.org/) estable.
|
||||||
|
|
||||||
|
crea la base de datos:
|
||||||
|
```
|
||||||
|
cd db-datos/
|
||||||
|
pnpm install
|
||||||
|
pnpm migrate
|
||||||
|
```
|
||||||
|
|
||||||
después, escrapea un sample de productos de Carrefour a una BD:
|
después, escrapea un sample de productos de Carrefour a una BD:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd scraper/
|
cd ../scraper-rs/
|
||||||
bun install
|
cargo run -- fetch-list ../data/samples/Carrefour.50.txt
|
||||||
bun cli.ts scrap ./data/samples/Carrefour.50.txt
|
|
||||||
```
|
```
|
||||||
|
|
||||||
ahora miralo en el sitio:
|
ahora miralo en el sitio:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd sitio/
|
cd ../sitio/
|
||||||
bun install
|
pnpm install
|
||||||
bun dev
|
pnpm dev
|
||||||
```
|
```
|
||||||
|
|
|
@ -24,7 +24,6 @@
|
||||||
"zod": "^3.22.4"
|
"zod": "^3.22.4"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/bun": "^1.0.0",
|
|
||||||
"typescript": "^5.3.3"
|
"typescript": "^5.3.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,8 +5,8 @@
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "vite dev",
|
"dev": "vite dev",
|
||||||
"build": "vite build",
|
"build": "vite build",
|
||||||
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Containerfile ..",
|
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Dockerfile ..",
|
||||||
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
|
"push:container": "pnpm build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
|
||||||
"preview": "vite preview",
|
"preview": "vite preview",
|
||||||
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
|
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
|
||||||
"check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch",
|
"check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch",
|
||||||
|
|
Loading…
Reference in a new issue