mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
Compare commits
3 commits
88e3aef8ad
...
4589eee5c0
Author | SHA1 | Date | |
---|---|---|---|
4589eee5c0 | |||
ff9a69e5cd | |||
1dce9abe59 |
15 changed files with 148 additions and 3573 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -12,5 +12,4 @@ scraper/debug/
|
||||||
scraper/x.tsv
|
scraper/x.tsv
|
||||||
*.tmp
|
*.tmp
|
||||||
target/
|
target/
|
||||||
.env
|
.env.*
|
||||||
.env*
|
|
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
1
db-datos/.env
Normal file
1
db-datos/.env
Normal file
|
@ -0,0 +1 @@
|
||||||
|
DB_PATH=../scraper/sqlite.db
|
|
@ -1,10 +1,14 @@
|
||||||
import type { Config } from "drizzle-kit";
|
import type { Config } from "drizzle-kit";
|
||||||
|
|
||||||
|
if (!process.env.DB_PATH) throw new Error("no hay DB_PATH");
|
||||||
|
|
||||||
|
export const DB_PATH = process.env.DB_PATH;
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
schema: "./schema.ts",
|
schema: "./schema.ts",
|
||||||
out: "./drizzle",
|
out: "./drizzle",
|
||||||
driver: "better-sqlite",
|
driver: "better-sqlite",
|
||||||
dbCredentials: {
|
dbCredentials: {
|
||||||
url: "../scraper/sqlite.db",
|
url: process.env.DB_PATH,
|
||||||
},
|
},
|
||||||
} satisfies Config;
|
} satisfies Config;
|
||||||
|
|
1
db-datos/drizzle/0003_abandoned_landau.sql
Normal file
1
db-datos/drizzle/0003_abandoned_landau.sql
Normal file
|
@ -0,0 +1 @@
|
||||||
|
CREATE INDEX `precios_ean_idx` ON `precios` (`ean`);
|
101
db-datos/drizzle/meta/0003_snapshot.json
Normal file
101
db-datos/drizzle/meta/0003_snapshot.json
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
{
|
||||||
|
"version": "5",
|
||||||
|
"dialect": "sqlite",
|
||||||
|
"id": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
|
||||||
|
"prevId": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
|
||||||
|
"tables": {
|
||||||
|
"precios": {
|
||||||
|
"name": "precios",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": true
|
||||||
|
},
|
||||||
|
"ean": {
|
||||||
|
"name": "ean",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"fetched_at": {
|
||||||
|
"name": "fetched_at",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"precio_centavos": {
|
||||||
|
"name": "precio_centavos",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"in_stock": {
|
||||||
|
"name": "in_stock",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"name": "url",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"warc_record_id": {
|
||||||
|
"name": "warc_record_id",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"parser_version": {
|
||||||
|
"name": "parser_version",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"name": "name",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"image_url": {
|
||||||
|
"name": "image_url",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"precios_ean_idx": {
|
||||||
|
"name": "precios_ean_idx",
|
||||||
|
"columns": [
|
||||||
|
"ean"
|
||||||
|
],
|
||||||
|
"isUnique": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"enums": {},
|
||||||
|
"_meta": {
|
||||||
|
"schemas": {},
|
||||||
|
"tables": {},
|
||||||
|
"columns": {}
|
||||||
|
}
|
||||||
|
}
|
|
@ -22,6 +22,13 @@
|
||||||
"when": 1703452301821,
|
"when": 1703452301821,
|
||||||
"tag": "0002_wild_amazoness",
|
"tag": "0002_wild_amazoness",
|
||||||
"breakpoints": true
|
"breakpoints": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 3,
|
||||||
|
"version": "5",
|
||||||
|
"when": 1703521964385,
|
||||||
|
"tag": "0003_abandoned_landau",
|
||||||
|
"breakpoints": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
|
@ -2,8 +2,9 @@ import Database from "bun:sqlite";
|
||||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
||||||
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
|
|
||||||
const sqlite = new Database("../scraper/sqlite.db");
|
const sqlite = new Database(DB_PATH);
|
||||||
const db = drizzle(sqlite, { schema });
|
const db = drizzle(sqlite, { schema });
|
||||||
|
|
||||||
migrate(db, { migrationsFolder: "./drizzle" });
|
migrate(db, { migrationsFolder: "./drizzle" });
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
||||||
|
|
||||||
export const precios = sqliteTable("precios", {
|
export const precios = sqliteTable(
|
||||||
|
"precios",
|
||||||
|
{
|
||||||
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
||||||
ean: text("ean").notNull(),
|
ean: text("ean").notNull(),
|
||||||
fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(),
|
fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(),
|
||||||
|
@ -11,6 +13,12 @@ export const precios = sqliteTable("precios", {
|
||||||
parserVersion: integer("parser_version"),
|
parserVersion: integer("parser_version"),
|
||||||
name: text("name"),
|
name: text("name"),
|
||||||
imageUrl: text("image_url"),
|
imageUrl: text("image_url"),
|
||||||
});
|
},
|
||||||
|
(precios) => {
|
||||||
|
return {
|
||||||
|
preciosEanIdx: index("precios_ean_idx").on(precios.ean),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
export type Precio = typeof precios.$inferSelect;
|
export type Precio = typeof precios.$inferSelect;
|
||||||
|
|
3547
pnpm-lock.yaml
3547
pnpm-lock.yaml
File diff suppressed because it is too large
Load diff
1
scraper/.env
Normal file
1
scraper/.env
Normal file
|
@ -0,0 +1 @@
|
||||||
|
DB_PATH=../scraper/sqlite.db
|
|
@ -1,4 +1,3 @@
|
||||||
import "dotenv/config";
|
|
||||||
import { mkdtemp, access } from "node:fs/promises";
|
import { mkdtemp, access } from "node:fs/promises";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { join, resolve } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
|
@ -50,13 +49,13 @@ supermercados.forEach(downloadList);
|
||||||
|
|
||||||
async function downloadList(supermercado: Supermercado) {
|
async function downloadList(supermercado: Supermercado) {
|
||||||
const listPath = resolve(
|
const listPath = resolve(
|
||||||
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`),
|
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
|
||||||
);
|
);
|
||||||
const date = new Date();
|
const date = new Date();
|
||||||
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
||||||
const zstdWarcName = `${supermercado}-${format(
|
const zstdWarcName = `${supermercado}-${format(
|
||||||
date,
|
date,
|
||||||
"yyyy-MM-dd-HH:mm",
|
"yyyy-MM-dd-HH:mm"
|
||||||
)}.warc.zst`;
|
)}.warc.zst`;
|
||||||
const zstdWarcPath = join(ctxPath, zstdWarcName);
|
const zstdWarcPath = join(ctxPath, zstdWarcName);
|
||||||
const subproc = Bun.spawn({
|
const subproc = Bun.spawn({
|
||||||
|
@ -122,7 +121,7 @@ async function scrapAndInform({
|
||||||
inform(
|
inform(
|
||||||
`Procesado ${zstdWarcName} (${progress.done} ok, ${
|
`Procesado ${zstdWarcName} (${progress.done} ok, ${
|
||||||
progress.errors.length
|
progress.errors.length
|
||||||
} errores) (tardó ${formatMs(took)})`,
|
} errores) (tardó ${formatMs(took)})`
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
inform(`Algo falló en ${zstdWarcName}`);
|
inform(`Algo falló en ${zstdWarcName}`);
|
||||||
|
@ -159,7 +158,7 @@ function recompress(inputPath: string, outputPath: string) {
|
||||||
["-T0", "-15", "--long", "-o", outputPath],
|
["-T0", "-15", "--long", "-o", outputPath],
|
||||||
{
|
{
|
||||||
stdio: ["pipe", null, null],
|
stdio: ["pipe", null, null],
|
||||||
},
|
}
|
||||||
);
|
);
|
||||||
// @ts-expect-error a los types de bun no le gusta????
|
// @ts-expect-error a los types de bun no le gusta????
|
||||||
decompressor.stdout.pipe(compressor.stdin);
|
decompressor.stdout.pipe(compressor.stdin);
|
||||||
|
@ -215,7 +214,7 @@ async function sendTelegramMsg(text: string) {
|
||||||
if (!process.env.TELEGRAM_BOT_TOKEN || !process.env.TELEGRAM_BOT_CHAT_ID)
|
if (!process.env.TELEGRAM_BOT_TOKEN || !process.env.TELEGRAM_BOT_CHAT_ID)
|
||||||
return;
|
return;
|
||||||
const url = new URL(
|
const url = new URL(
|
||||||
`https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}/sendMessage`,
|
`https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}/sendMessage`
|
||||||
);
|
);
|
||||||
url.searchParams.set("chat_id", process.env.TELEGRAM_BOT_CHAT_ID);
|
url.searchParams.set("chat_id", process.env.TELEGRAM_BOT_CHAT_ID);
|
||||||
url.searchParams.set("text", text);
|
url.searchParams.set("text", text);
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
"@aws-sdk/lib-storage": "^3.478.0",
|
"@aws-sdk/lib-storage": "^3.478.0",
|
||||||
"date-fns": "^3.0.6",
|
"date-fns": "^3.0.6",
|
||||||
"db-datos": "workspace:^",
|
"db-datos": "workspace:^",
|
||||||
"dotenv": "^16.3.1",
|
|
||||||
"drizzle-orm": "^0.29.1",
|
"drizzle-orm": "^0.29.1",
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
|
|
|
@ -9,11 +9,12 @@ import { getDiaProduct } from "./parsers/dia.js";
|
||||||
import { getCotoProduct } from "./parsers/coto.js";
|
import { getCotoProduct } from "./parsers/coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { and, eq, sql } from "drizzle-orm";
|
import { and, eq, sql } from "drizzle-orm";
|
||||||
|
import { DB_PATH } from "db-datos/drizzle.config.js";
|
||||||
|
|
||||||
const DEBUG = false;
|
const DEBUG = false;
|
||||||
const PARSER_VERSION = 2;
|
const PARSER_VERSION = 2;
|
||||||
|
|
||||||
const sqlite = new Database("sqlite.db");
|
const sqlite = new Database(DB_PATH);
|
||||||
const db = drizzle(sqlite, { schema });
|
const db = drizzle(sqlite, { schema });
|
||||||
|
|
||||||
sqlite.run(`
|
sqlite.run(`
|
||||||
|
|
|
@ -3,7 +3,7 @@ import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||||
import * as schema from "db-datos/schema.js";
|
import * as schema from "db-datos/schema.js";
|
||||||
import { env } from "$env/dynamic/private";
|
import { env } from "$env/dynamic/private";
|
||||||
|
|
||||||
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
|
const sqlite = new Database(env.DB_PATH);
|
||||||
|
|
||||||
export const db = drizzle(sqlite, { schema });
|
export const db = drizzle(sqlite, { schema });
|
||||||
export * as schema from "db-datos/schema.js";
|
export * as schema from "db-datos/schema.js";
|
||||||
|
|
Loading…
Reference in a new issue