Compare commits

..

No commits in common. "6c0c2e89f7dbc0f8e7c8ef8980002eee9edc2422" and "4589eee5c09756abe0dec8043cfa059861fbf783" have entirely different histories.

16 changed files with 227 additions and 324 deletions

View file

@ -1,7 +0,0 @@
data/warcs/
data/carrefour/
*/*.db*
downloader/
node_modules/
*/node_modules/
*/Containerfile

BIN
bun.lockb

Binary file not shown.

1
db-datos/.env Normal file
View file

@ -0,0 +1 @@
DB_PATH=../scraper/sqlite.db

View file

@ -7,6 +7,7 @@ export const DB_PATH = process.env.DB_PATH;
export default { export default {
schema: "./schema.ts", schema: "./schema.ts",
out: "./drizzle", out: "./drizzle",
driver: "better-sqlite",
dbCredentials: { dbCredentials: {
url: process.env.DB_PATH, url: process.env.DB_PATH,
}, },

View file

@ -1,19 +1,12 @@
import Database from "bun:sqlite"; import Database from "bun:sqlite";
import { join } from "node:path";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator"; import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
export function migrateDb() { const sqlite = new Database(DB_PATH);
const sqlite = new Database(DB_PATH); const db = drizzle(sqlite, { schema });
const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: join(import.meta.dir, "drizzle") }); migrate(db, { migrationsFolder: "./drizzle" });
sqlite.run(`
pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL;
`);
sqlite.close(); sqlite.close();
}

View file

@ -15,7 +15,9 @@
"drizzle-orm": "^0.29.1" "drizzle-orm": "^0.29.1"
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "^1.0.0", "better-sqlite3": "^9.2.2",
"drizzle-kit": "^0.20.7" "drizzle-kit": "^0.20.7",
"tsx": "^4.7.0",
"@types/better-sqlite3": "^7.6.8"
} }
} }

1
scraper/.env Normal file
View file

@ -0,0 +1 @@
DB_PATH=../scraper/sqlite.db

View file

@ -1,27 +0,0 @@
FROM oven/bun:1-alpine AS base
WORKDIR /usr/src/app
FROM base AS builder
ENV NODE_ENV=production
COPY . .
RUN bun install --frozen-lockfile \
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
&& rm -rf node_modules/
FROM base
RUN apk add --no-cache wget zstd cronie tini
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper
COPY --from=builder /tmp/cli.build.js /bin/scraper
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=builder /usr/src/app/data /listas
WORKDIR /app
VOLUME /db
ENV NODE_ENV=production
ENV DB_PATH=/db/db.db
ENV LISTS_DIR=/listas/
CMD ["tini", "/usr/sbin/crond", "-n"]
# CMD ["bun", "/bin/scraper"]

View file

@ -10,6 +10,28 @@ import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage"; import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun"; import { BunFile } from "bun";
if (
!process.env.S3_ACCESS_KEY_ID ||
!process.env.S3_SECRET_ACCESS_KEY ||
!process.env.S3_BUCKET_NAME
)
throw new Error("missing s3 creds");
if (!process.env.TELEGRAM_BOT_TOKEN)
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
if (!process.env.TELEGRAM_BOT_CHAT_ID)
console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
const { S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY } = process.env;
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
const s3 = new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: S3_ACCESS_KEY_ID,
secretAccessKey: S3_SECRET_ACCESS_KEY,
},
});
const supermercados: Supermercado[] = [ const supermercados: Supermercado[] = [
Supermercado.Carrefour, Supermercado.Carrefour,
Supermercado.Coto, Supermercado.Coto,
@ -22,53 +44,10 @@ const compressionQueue = new PQueue({ concurrency: 1 });
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 1 }); const scrapQueue = new PQueue({ concurrency: 1 });
export async function auto() { supermercados.forEach(downloadList);
const a = new Auto(); // await recompress("sqlite.db.gz", "sqlite.db.zst");
await Promise.all(supermercados.map((supr) => a.downloadList(supr)));
}
class Auto { async function downloadList(supermercado: Supermercado) {
s3Config?: { s3: S3Client; bucketName: string };
telegramConfig?: { token: string; chatId: string };
constructor() {
if (
!process.env.S3_ACCESS_KEY_ID ||
!process.env.S3_SECRET_ACCESS_KEY ||
!process.env.S3_BUCKET_NAME
) {
if (process.env.NODE_ENV === "development") {
console.warn("faltan creds de s3, no voy a subir a s3");
} else {
throw new Error("faltan creds de s3");
}
} else {
this.s3Config = {
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
s3: new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: process.env.S3_ACCESS_KEY_ID,
secretAccessKey: process.env.S3_SECRET_ACCESS_KEY,
},
}),
bucketName: process.env.S3_BUCKET_NAME,
};
}
if (!process.env.TELEGRAM_BOT_TOKEN)
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
else if (!process.env.TELEGRAM_BOT_CHAT_ID)
console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
else
this.telegramConfig = {
token: process.env.TELEGRAM_BOT_TOKEN,
chatId: process.env.TELEGRAM_BOT_CHAT_ID,
};
}
async downloadList(supermercado: Supermercado) {
const listPath = resolve( const listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`) join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
); );
@ -95,46 +74,42 @@ class Auto {
}); });
const t0 = performance.now(); const t0 = performance.now();
await subproc.exited; await subproc.exited;
this.inform( inform(`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`);
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
);
const gzippedWarcPath = join(ctxPath, "temp.warc.gz"); const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
if (!(await fileExists(gzippedWarcPath))) { if (!(await exists(gzippedWarcPath))) {
const err = this.report(`no encontré el ${gzippedWarcPath}`); const err = report(`no encontré el ${gzippedWarcPath}`);
throw err; throw err;
} }
await compressionQueue.add(() => await compressionQueue.add(() => recompress(gzippedWarcPath, zstdWarcPath));
this.recompress(gzippedWarcPath, zstdWarcPath) if (!(await exists(zstdWarcPath))) {
); const err = report(`no encontré el ${zstdWarcPath}`);
if (!(await fileExists(zstdWarcPath))) {
const err = this.report(`no encontré el ${zstdWarcPath}`);
throw err; throw err;
} }
this.scrapAndInform({ zstdWarcPath, zstdWarcName }); scrapAndInform({ zstdWarcPath, zstdWarcName });
try { try {
await this.uploadToBucket({ await uploadToBucket({
fileName: zstdWarcName, fileName: zstdWarcName,
file: Bun.file(zstdWarcPath), file: Bun.file(zstdWarcPath),
}); });
} catch (error) { } catch (error) {
this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`); inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
console.error(error); console.error(error);
} }
// TODO: borrar archivos temporales // TODO: borrar archivos temporales
} }
async scrapAndInform({ async function scrapAndInform({
zstdWarcPath, zstdWarcPath,
zstdWarcName, zstdWarcName,
}: { }: {
zstdWarcPath: string; zstdWarcPath: string;
zstdWarcName: string; zstdWarcName: string;
}) { }) {
const res = await scrapQueue.add(async () => { const res = await scrapQueue.add(async () => {
const t0 = performance.now(); const t0 = performance.now();
const progress = await parseWarc(zstdWarcPath); const progress = await parseWarc(zstdWarcPath);
@ -143,21 +118,21 @@ class Auto {
if (res) { if (res) {
const { took, progress } = res; const { took, progress } = res;
this.inform( inform(
`Procesado ${zstdWarcName} (${progress.done} ok, ${ `Procesado ${zstdWarcName} (${progress.done} ok, ${
progress.errors.length progress.errors.length
} errores) (tardó ${formatMs(took)})` } errores) (tardó ${formatMs(took)})`
); );
} else { } else {
this.inform(`Algo falló en ${zstdWarcName}`); inform(`Algo falló en ${zstdWarcName}`);
}
} }
}
/** /**
* toma un archivo gzippeado y lo recomprime con zstd. * toma un archivo gzippeado y lo recomprime con zstd.
* borra el archivo original. * borra el archivo original.
*/ */
recompress(inputPath: string, outputPath: string) { function recompress(inputPath: string, outputPath: string) {
// XXX: por alguna razón no funciona en Bun 1.0.20 // XXX: por alguna razón no funciona en Bun 1.0.20
// const decompressor = Bun.spawn({ // const decompressor = Bun.spawn({
// cmd: ["gzip", "-dc", inputPath], // cmd: ["gzip", "-dc", inputPath],
@ -189,63 +164,44 @@ class Auto {
decompressor.stdout.pipe(compressor.stdin); decompressor.stdout.pipe(compressor.stdin);
compressor.on("close", (code) => { compressor.on("close", (code) => {
if (code !== 0) { if (code !== 0) {
const err = this.report(`zstd threw error code ${code}`); const err = report(`zstd threw error code ${code}`);
reject(err); reject(err);
} }
resolve(void 0); resolve(void 0);
}); });
}); });
} }
async uploadToBucket({ async function uploadToBucket({
fileName, fileName,
file, file,
}: { }: {
fileName: string; fileName: string;
file: BunFile; file: BunFile;
}) { }) {
if (!this.s3Config) {
this.inform(
`[s3] Se intentó subir ${fileName} pero no tenemos creds de S3`
);
return;
}
const parallelUploads3 = new Upload({ const parallelUploads3 = new Upload({
client: this.s3Config.s3, client: s3,
params: { params: {
Bucket: this.s3Config.bucketName, Bucket: S3_BUCKET_NAME,
Key: fileName, Key: fileName,
Body: file, Body: file,
}, },
}); });
await parallelUploads3.done(); await parallelUploads3.done();
} }
inform(msg: string) { function inform(msg: string) {
this.sendTelegramMsg(msg); sendTelegramMsg(msg);
console.info(msg); console.info(msg);
} }
report(msg: string) { function report(msg: string) {
this.inform(msg); inform(msg);
const error = new Error(msg); const error = new Error(msg);
return error; return error;
}
async sendTelegramMsg(text: string) {
if (!this.telegramConfig) return;
const url = new URL(
`https://api.telegram.org/bot${this.telegramConfig.token}/sendMessage`
);
url.searchParams.set("chat_id", this.telegramConfig.chatId);
url.searchParams.set("text", text);
await fetch(url);
}
} }
// await recompress("sqlite.db.gz", "sqlite.db.zst");
// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists async function exists(path: string) {
async function fileExists(path: string) {
try { try {
access(path); access(path);
return true; return true;
@ -254,6 +210,17 @@ async function fileExists(path: string) {
} }
} }
async function sendTelegramMsg(text: string) {
if (!process.env.TELEGRAM_BOT_TOKEN || !process.env.TELEGRAM_BOT_CHAT_ID)
return;
const url = new URL(
`https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}/sendMessage`
);
url.searchParams.set("chat_id", process.env.TELEGRAM_BOT_CHAT_ID);
url.searchParams.set("text", text);
await fetch(url);
}
function formatMs(ms: number) { function formatMs(ms: number) {
return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) })); return formatDuration(intervalToDuration({ start: 0, end: Math.round(ms) }));
} }

View file

@ -1,19 +0,0 @@
import { auto } from "./auto.js";
import { parseWarc } from "./scrap.js";
if (process.argv[2] === "auto") {
await auto();
} else if (process.argv[2] === "scrap") {
const warcPaths = process.argv.slice(3);
if (warcPaths.length > 0) {
for (const path of warcPaths) {
await parseWarc(path);
}
} else {
console.error("Especificá WARCs para scrapear.");
process.exit(1);
}
} else {
console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
process.exit(1);
}

View file

@ -5,7 +5,7 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile .." "test": "echo \"Error: no test specified\" && exit 1"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",

View file

@ -10,16 +10,17 @@ import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
import { DB_PATH } from "db-datos/drizzle.config.js"; import { DB_PATH } from "db-datos/drizzle.config.js";
import { migrateDb } from "db-datos/migrate.js";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 2; const PARSER_VERSION = 2;
migrateDb();
const sqlite = new Database(DB_PATH); const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
sqlite.run(`
pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL;
`);
const getPrevPrecio = db const getPrevPrecio = db
.select({ id: schema.precios.id }) .select({ id: schema.precios.id })
.from(schema.precios) .from(schema.precios)
@ -32,6 +33,12 @@ const getPrevPrecio = db
.limit(1) .limit(1)
.prepare(); .prepare();
if (process.argv[1].endsWith("/scrap.ts")) {
for (const path of process.argv.slice(2)) {
await parseWarc(path);
}
}
export type Precio = typeof schema.precios.$inferInsert; export type Precio = typeof schema.precios.$inferInsert;
export type Precioish = Omit< export type Precioish = Omit<
Precio, Precio,

View file

@ -7,7 +7,4 @@ EXPOSE 3000
ENV PROTOCOL_HEADER=x-forwarded-proto ENV PROTOCOL_HEADER=x-forwarded-proto
ENV HOST_HEADER=x-forwarded-host ENV HOST_HEADER=x-forwarded-host
VOLUME /db
ENV DB_PATH=/db/db.db
CMD ["bun", "run", "start"] CMD ["bun", "run", "start"]

View file

@ -5,7 +5,6 @@
"scripts": { "scripts": {
"dev": "vite dev", "dev": "vite dev",
"build": "vite build", "build": "vite build",
"build:container": "bun --bun vite build && podman build -t gitea.nulo.in/nulo/preciazo/sitio .",
"preview": "vite preview", "preview": "vite preview",
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
"check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch", "check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch",

View file

@ -1,40 +1,28 @@
<script lang="ts"> <script lang="ts">
import { Supermercado, hosts } from "db-datos/supermercado";
import type { PageData } from "./$types"; import type { PageData } from "./$types";
import Chart from "./Chart.svelte"; import Chart from "./Chart.svelte";
export let data: PageData; export let data: PageData;
let urls: Map<Supermercado, string>;
$: urls = data.precios.toReversed().reduce((prev, curr) => {
const url = new URL(curr.url);
const supermercado = hosts[url.hostname];
prev.set(supermercado, curr.url);
return prev;
}, new Map<Supermercado, string>());
const classBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
[Supermercado.Carrefour]: "bg-[#19549d] focus:ring-[#19549d]",
[Supermercado.Coto]: "bg-[#e20025] focus:ring-[#e20025]",
};
</script> </script>
{#if data.meta} {#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1> <h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} class="max-h-48" /> <img src={data.meta.imageUrl} class="max-h-48" />
<div class="flex gap-2">
{#each urls as [supermercado, url]}
<a
href={url}
rel="noreferrer noopener"
target="_blank"
class={`focus:shadow-outline inline-flex items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 text-sm font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
>
Ver en {supermercado}
</a>
{/each}
</div>
{/if} {/if}
<ul>
{#each data.precios as precio}
<li>
{precio.url}
:
{#if precio.precioCentavos}
{precio.precioCentavos / 100}
{:else}
{precio.inStock}
{/if}
({precio.fetchedAt})
</li>
{/each}
</ul>
<Chart precios={data.precios} /> <Chart precios={data.precios} />