Compare commits

..

3 commits

Author SHA1 Message Date
4589eee5c0 siempre tomar BD de DB_PATH 2023-12-25 23:05:50 -03:00
ff9a69e5cd limpiar pnpm 2023-12-25 23:03:55 -03:00
1dce9abe59 index ean para perf 2023-12-25 13:33:11 -03:00
15 changed files with 148 additions and 3573 deletions

3
.gitignore vendored
View file

@ -12,5 +12,4 @@ scraper/debug/
scraper/x.tsv scraper/x.tsv
*.tmp *.tmp
target/ target/
.env .env.*
.env*

BIN
bun.lockb

Binary file not shown.

1
db-datos/.env Normal file
View file

@ -0,0 +1 @@
DB_PATH=../scraper/sqlite.db

View file

@ -1,10 +1,14 @@
import type { Config } from "drizzle-kit"; import type { Config } from "drizzle-kit";
if (!process.env.DB_PATH) throw new Error("no hay DB_PATH");
export const DB_PATH = process.env.DB_PATH;
export default { export default {
schema: "./schema.ts", schema: "./schema.ts",
out: "./drizzle", out: "./drizzle",
driver: "better-sqlite", driver: "better-sqlite",
dbCredentials: { dbCredentials: {
url: "../scraper/sqlite.db", url: process.env.DB_PATH,
}, },
} satisfies Config; } satisfies Config;

View file

@ -0,0 +1 @@
CREATE INDEX `precios_ean_idx` ON `precios` (`ean`);

View file

@ -0,0 +1,101 @@
{
"version": "5",
"dialect": "sqlite",
"id": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"prevId": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -22,6 +22,13 @@
"when": 1703452301821, "when": 1703452301821,
"tag": "0002_wild_amazoness", "tag": "0002_wild_amazoness",
"breakpoints": true "breakpoints": true
},
{
"idx": 3,
"version": "5",
"when": 1703521964385,
"tag": "0003_abandoned_landau",
"breakpoints": true
} }
] ]
} }

View file

@ -2,8 +2,9 @@ import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator"; import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js";
const sqlite = new Database("../scraper/sqlite.db"); const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: "./drizzle" }); migrate(db, { migrationsFolder: "./drizzle" });

View file

@ -1,16 +1,24 @@
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core"; import { index, integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
export const precios = sqliteTable("precios", { export const precios = sqliteTable(
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), "precios",
ean: text("ean").notNull(), {
fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(), id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
precioCentavos: integer("precio_centavos"), ean: text("ean").notNull(),
inStock: integer("in_stock", { mode: "boolean" }), fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(),
url: text("url").notNull(), precioCentavos: integer("precio_centavos"),
warcRecordId: text("warc_record_id"), inStock: integer("in_stock", { mode: "boolean" }),
parserVersion: integer("parser_version"), url: text("url").notNull(),
name: text("name"), warcRecordId: text("warc_record_id"),
imageUrl: text("image_url"), parserVersion: integer("parser_version"),
}); name: text("name"),
imageUrl: text("image_url"),
},
(precios) => {
return {
preciosEanIdx: index("precios_ean_idx").on(precios.ean),
};
}
);
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;

File diff suppressed because it is too large Load diff

1
scraper/.env Normal file
View file

@ -0,0 +1 @@
DB_PATH=../scraper/sqlite.db

View file

@ -1,4 +1,3 @@
import "dotenv/config";
import { mkdtemp, access } from "node:fs/promises"; import { mkdtemp, access } from "node:fs/promises";
import { tmpdir } from "node:os"; import { tmpdir } from "node:os";
import { join, resolve } from "node:path"; import { join, resolve } from "node:path";
@ -50,13 +49,13 @@ supermercados.forEach(downloadList);
async function downloadList(supermercado: Supermercado) { async function downloadList(supermercado: Supermercado) {
const listPath = resolve( const listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`), join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
); );
const date = new Date(); const date = new Date();
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-")); const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
const zstdWarcName = `${supermercado}-${format( const zstdWarcName = `${supermercado}-${format(
date, date,
"yyyy-MM-dd-HH:mm", "yyyy-MM-dd-HH:mm"
)}.warc.zst`; )}.warc.zst`;
const zstdWarcPath = join(ctxPath, zstdWarcName); const zstdWarcPath = join(ctxPath, zstdWarcName);
const subproc = Bun.spawn({ const subproc = Bun.spawn({
@ -122,7 +121,7 @@ async function scrapAndInform({
inform( inform(
`Procesado ${zstdWarcName} (${progress.done} ok, ${ `Procesado ${zstdWarcName} (${progress.done} ok, ${
progress.errors.length progress.errors.length
} errores) (tardó ${formatMs(took)})`, } errores) (tardó ${formatMs(took)})`
); );
} else { } else {
inform(`Algo falló en ${zstdWarcName}`); inform(`Algo falló en ${zstdWarcName}`);
@ -159,7 +158,7 @@ function recompress(inputPath: string, outputPath: string) {
["-T0", "-15", "--long", "-o", outputPath], ["-T0", "-15", "--long", "-o", outputPath],
{ {
stdio: ["pipe", null, null], stdio: ["pipe", null, null],
}, }
); );
// @ts-expect-error a los types de bun no le gusta???? // @ts-expect-error a los types de bun no le gusta????
decompressor.stdout.pipe(compressor.stdin); decompressor.stdout.pipe(compressor.stdin);
@ -215,7 +214,7 @@ async function sendTelegramMsg(text: string) {
if (!process.env.TELEGRAM_BOT_TOKEN || !process.env.TELEGRAM_BOT_CHAT_ID) if (!process.env.TELEGRAM_BOT_TOKEN || !process.env.TELEGRAM_BOT_CHAT_ID)
return; return;
const url = new URL( const url = new URL(
`https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}/sendMessage`, `https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}/sendMessage`
); );
url.searchParams.set("chat_id", process.env.TELEGRAM_BOT_CHAT_ID); url.searchParams.set("chat_id", process.env.TELEGRAM_BOT_CHAT_ID);
url.searchParams.set("text", text); url.searchParams.set("text", text);

View file

@ -15,7 +15,6 @@
"@aws-sdk/lib-storage": "^3.478.0", "@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6", "date-fns": "^3.0.6",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"dotenv": "^16.3.1",
"drizzle-orm": "^0.29.1", "drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"nanoid": "^5.0.4", "nanoid": "^5.0.4",

View file

@ -9,11 +9,12 @@ import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js"; import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
import { DB_PATH } from "db-datos/drizzle.config.js";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 2; const PARSER_VERSION = 2;
const sqlite = new Database("sqlite.db"); const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
sqlite.run(` sqlite.run(`

View file

@ -3,7 +3,7 @@ import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private"; import { env } from "$env/dynamic/private";
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db"); const sqlite = new Database(env.DB_PATH);
export const db = drizzle(sqlite, { schema }); export const db = drizzle(sqlite, { schema });
export * as schema from "db-datos/schema.js"; export * as schema from "db-datos/schema.js";