mejorar contenedor de scraper

poder importar WARCs específicos
This commit is contained in:
Cat /dev/Nulo 2023-12-26 15:22:27 -03:00
parent 8a49ddab7d
commit fee0e1b872
6 changed files with 242 additions and 195 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -1,4 +1,5 @@
import Database from "bun:sqlite";
import { join } from "node:path";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js";
@ -8,7 +9,7 @@ export function migrateDb() {
const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: "./drizzle" });
migrate(db, { migrationsFolder: join(import.meta.dir, "drizzle") });
sqlite.run(`
pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL;

View file

@ -5,22 +5,23 @@ FROM base AS builder
# --- builder stage: install dependencies and bundle the scraper entry points ---
ENV NODE_ENV=production
COPY . .
# Bundle with bun for the bun runtime into /tmp, then drop node_modules.
RUN bun install --frozen-lockfile \
&& bun build scraper/auto.ts --target=bun --outfile=/tmp/auto.build.js \
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
&& rm -rf node_modules/
# --- runtime stage ---
FROM base
# wget, zstd, cronie (crond) and tini are installed as runtime tools.
RUN apk add --no-cache wget zstd cronie tini
# Write an executable shell script into /etc/periodic/daily that starts the scraper.
RUN printf "#!/bin/sh\nexec bun /app/built.js" > /etc/periodic/daily/scraper \
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper
# Copy the bundle, the db-datos/drizzle directory and the data directory out of the builder stage.
# NOTE(review): the drizzle directory is presumably the migrations folder and is expected
# to sit next to the bundled script — confirm against migrateDb.
COPY --from=builder /tmp/auto.build.js /app/built.js
COPY --from=builder /usr/src/app/db-datos/drizzle /app/drizzle
COPY --from=builder /tmp/cli.build.js /bin/scraper
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=builder /usr/src/app/data /listas
WORKDIR /app
# /db is declared as a volume; DB_PATH below points at a file inside it.
VOLUME /db
ENV NODE_ENV=production
ENV DB_PATH=/db/db.db
# LISTS_DIR points at the directory copied to /listas above.
ENV LISTS_DIR=/listas/
# Default command: crond in the foreground (-n), started through tini.
CMD ["tini", "/usr/sbin/crond", "-n"]
# CMD ["bun", "/app/built.js"]
# CMD ["bun", "/bin/scraper"]

View file

@ -10,28 +10,6 @@ import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun";
// Fail fast at module load: all three S3 variables must be set, otherwise throw.
if (
!process.env.S3_ACCESS_KEY_ID ||
!process.env.S3_SECRET_ACCESS_KEY ||
!process.env.S3_BUCKET_NAME
)
throw new Error("missing s3 creds");
// The Telegram variables are optional: a missing one only produces a warning.
if (!process.env.TELEGRAM_BOT_TOKEN)
console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
if (!process.env.TELEGRAM_BOT_CHAT_ID)
console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
// Module-level bindings for the S3 settings checked above.
const { S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY } = process.env;
// S3 client pointed at a Backblaze B2 S3-compatible endpoint; see:
// https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
const s3 = new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: S3_ACCESS_KEY_ID,
secretAccessKey: S3_SECRET_ACCESS_KEY,
},
});
const supermercados: Supermercado[] = [
Supermercado.Carrefour,
Supermercado.Coto,
@ -44,10 +22,53 @@ const compressionQueue = new PQueue({ concurrency: 1 });
// scraping goes through a queue so that there are never several writers to the DB and the CPU is not overloaded
const scrapQueue = new PQueue({ concurrency: 1 });
supermercados.forEach(downloadList);
// await recompress("sqlite.db.gz", "sqlite.db.zst");
/**
 * Entry point of the automatic run: builds one `Auto` instance and starts
 * `downloadList` for every entry in `supermercados`, resolving once all of
 * them have finished.
 */
export async function auto() {
  const runner = new Auto();
  const pendingDownloads = supermercados.map((supermercado) =>
    runner.downloadList(supermercado)
  );
  await Promise.all(pendingDownloads);
}
async function downloadList(supermercado: Supermercado) {
class Auto {
s3Config?: { s3: S3Client; bucketName: string };
telegramConfig?: { token: string; chatId: string };
/**
 * Reads the S3 and Telegram settings from the environment.
 *
 * - S3: when all three variables are present, `s3Config` is populated.
 *   Otherwise a warning is printed if NODE_ENV is "development"; in any
 *   other environment the constructor throws.
 * - Telegram: `telegramConfig` is populated only when both the token and
 *   the chat id are present; otherwise a warning is printed.
 */
constructor() {
  const {
    S3_ACCESS_KEY_ID: accessKeyId,
    S3_SECRET_ACCESS_KEY: secretAccessKey,
    S3_BUCKET_NAME: bucketName,
    TELEGRAM_BOT_TOKEN: token,
    TELEGRAM_BOT_CHAT_ID: chatId,
    NODE_ENV: nodeEnv,
  } = process.env;

  if (accessKeyId && secretAccessKey && bucketName) {
    this.s3Config = {
      // https://www.backblaze.com/docs/cloud-storage-use-the-aws-sdk-for-javascript-v3-with-backblaze-b2
      s3: new S3Client({
        endpoint: "https://s3.us-west-004.backblazeb2.com",
        region: "us-west-004",
        credentials: { accessKeyId, secretAccessKey },
      }),
      bucketName,
    };
  } else if (nodeEnv === "development") {
    console.warn("faltan creds de s3, no voy a subir a s3");
  } else {
    throw new Error("faltan creds de s3");
  }

  if (!token) {
    console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
  } else if (!chatId) {
    console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
  } else {
    this.telegramConfig = { token, chatId };
  }
}
async downloadList(supermercado: Supermercado) {
const listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
);
@ -74,36 +95,40 @@ async function downloadList(supermercado: Supermercado) {
});
const t0 = performance.now();
await subproc.exited;
inform(`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`);
this.inform(
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
);
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
if (!(await fileExists(gzippedWarcPath))) {
const err = report(`no encontré el ${gzippedWarcPath}`);
const err = this.report(`no encontré el ${gzippedWarcPath}`);
throw err;
}
await compressionQueue.add(() => recompress(gzippedWarcPath, zstdWarcPath));
await compressionQueue.add(() =>
this.recompress(gzippedWarcPath, zstdWarcPath)
);
if (!(await fileExists(zstdWarcPath))) {
const err = report(`no encontré el ${zstdWarcPath}`);
const err = this.report(`no encontré el ${zstdWarcPath}`);
throw err;
}
scrapAndInform({ zstdWarcPath, zstdWarcName });
this.scrapAndInform({ zstdWarcPath, zstdWarcName });
try {
await uploadToBucket({
await this.uploadToBucket({
fileName: zstdWarcName,
file: Bun.file(zstdWarcPath),
});
} catch (error) {
inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
this.inform(`Falló subir ${zstdWarcName} a S3; ${error}`);
console.error(error);
}
// TODO: borrar archivos temporales
}
async function scrapAndInform({
async scrapAndInform({
zstdWarcPath,
zstdWarcName,
}: {
@ -118,13 +143,13 @@ async function scrapAndInform({
if (res) {
const { took, progress } = res;
inform(
this.inform(
`Procesado ${zstdWarcName} (${progress.done} ok, ${
progress.errors.length
} errores) (tardó ${formatMs(took)})`
);
} else {
inform(`Algo falló en ${zstdWarcName}`);
this.inform(`Algo falló en ${zstdWarcName}`);
}
}
@ -132,7 +157,7 @@ async function scrapAndInform({
* toma un archivo gzippeado y lo recomprime con zstd.
* borra el archivo original.
*/
function recompress(inputPath: string, outputPath: string) {
recompress(inputPath: string, outputPath: string) {
// XXX: por alguna razón no funciona en Bun 1.0.20
// const decompressor = Bun.spawn({
// cmd: ["gzip", "-dc", inputPath],
@ -164,7 +189,7 @@ function recompress(inputPath: string, outputPath: string) {
decompressor.stdout.pipe(compressor.stdin);
compressor.on("close", (code) => {
if (code !== 0) {
const err = report(`zstd threw error code ${code}`);
const err = this.report(`zstd threw error code ${code}`);
reject(err);
}
resolve(void 0);
@ -172,17 +197,23 @@ function recompress(inputPath: string, outputPath: string) {
});
}
async function uploadToBucket({
async uploadToBucket({
fileName,
file,
}: {
fileName: string;
file: BunFile;
}) {
if (!this.s3Config) {
this.inform(
`[s3] Se intentó subir ${fileName} pero no tenemos creds de S3`
);
return;
}
const parallelUploads3 = new Upload({
client: s3,
client: this.s3Config.s3,
params: {
Bucket: S3_BUCKET_NAME,
Bucket: this.s3Config.bucketName,
Key: fileName,
Body: file,
},
@ -190,17 +221,29 @@ async function uploadToBucket({
await parallelUploads3.done();
}
function inform(msg: string) {
sendTelegramMsg(msg);
/**
 * Hands `msg` to `sendTelegramMsg` and prints it with `console.info`.
 * The Telegram send is started but not awaited, so this method returns
 * synchronously.
 */
inform(msg: string) {
  void this.sendTelegramMsg(msg);
  console.info(msg);
}
function report(msg: string) {
inform(msg);
/**
 * Announces `msg` through `inform` and returns (without throwing) an
 * `Error` carrying the same message, leaving it to the caller to throw or
 * reject with it.
 */
report(msg: string) {
  this.inform(msg);
  return new Error(msg);
}
/**
 * Sends `text` to the configured Telegram chat through the Bot API
 * `sendMessage` endpoint.
 *
 * Does nothing when `telegramConfig` is unset. Delivery is best-effort:
 * `inform` calls this method without awaiting it, so a rejection here would
 * surface as an unhandled promise rejection. Network failures and non-2xx
 * responses are therefore logged to the console instead of being thrown.
 *
 * @param text Message body to send.
 */
async sendTelegramMsg(text: string) {
  if (!this.telegramConfig) return;
  const { token, chatId } = this.telegramConfig;
  try {
    const url = new URL(`https://api.telegram.org/bot${token}/sendMessage`);
    url.searchParams.set("chat_id", chatId);
    url.searchParams.set("text", text);
    const res = await fetch(url);
    if (!res.ok) {
      console.error(
        `[telegram] sendMessage respondió ${res.status} ${res.statusText}`
      );
    }
  } catch (error) {
    // Log only the message: the request URL embeds the bot token and must
    // not end up in the logs through a serialized error object.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`[telegram] no se pudo enviar el mensaje: ${reason}`);
  }
}
}
// await recompress("sqlite.db.gz", "sqlite.db.zst");
// not named `exists` because of a Bun bug: it ends up using fs.exists even when a function called `exists` is defined
async function fileExists(path: string) {
try {
@ -211,17 +254,6 @@ async function fileExists(path: string) {
}
}
/**
 * Sends `text` to the Telegram chat configured through the
 * TELEGRAM_BOT_TOKEN and TELEGRAM_BOT_CHAT_ID environment variables, using
 * the Bot API `sendMessage` endpoint.
 *
 * Does nothing when either variable is missing. Delivery is best-effort:
 * `inform` calls this function without awaiting it, so a rejection here
 * would surface as an unhandled promise rejection. Network failures and
 * non-2xx responses are therefore logged to the console instead of being
 * thrown.
 *
 * @param text Message body to send.
 */
async function sendTelegramMsg(text: string) {
  const token = process.env.TELEGRAM_BOT_TOKEN;
  const chatId = process.env.TELEGRAM_BOT_CHAT_ID;
  if (!token || !chatId) return;
  try {
    const url = new URL(`https://api.telegram.org/bot${token}/sendMessage`);
    url.searchParams.set("chat_id", chatId);
    url.searchParams.set("text", text);
    const res = await fetch(url);
    if (!res.ok) {
      console.error(
        `[telegram] sendMessage respondió ${res.status} ${res.statusText}`
      );
    }
  } catch (error) {
    // Log only the message: the request URL embeds the bot token and must
    // not end up in the logs through a serialized error object.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`[telegram] no se pudo enviar el mensaje: ${reason}`);
  }
}
/**
 * Formats a duration given in milliseconds as human-readable text.
 *
 * `formatDuration` leaves out zero-valued units by default, so a duration
 * shorter than one second would come back as an empty string and log
 * messages such as "tardó " would lose their value. In that case the rounded
 * millisecond count is returned instead.
 *
 * @param ms Duration in milliseconds; it is rounded to the nearest integer.
 * @returns The formatted duration, never an empty string.
 */
function formatMs(ms: number) {
  const roundedMs = Math.round(ms);
  const text = formatDuration(
    intervalToDuration({ start: 0, end: roundedMs })
  );
  // `||` on purpose: the empty string is exactly the case being replaced.
  return text || `${roundedMs} milliseconds`;
}

View file

@ -0,0 +1,19 @@
import { auto } from "./auto.js";
import { parseWarc } from "./scrap.js";
// CLI dispatch: the first positional argument selects the action, the
// remaining ones are the WARC paths for `scrap`.
const [action, ...warcPaths] = process.argv.slice(2);

switch (action) {
  case "auto":
    await auto();
    break;
  case "scrap":
    // `scrap` needs at least one WARC path; exit with status 1 otherwise.
    if (warcPaths.length === 0) {
      console.error("Especificá WARCs para scrapear.");
      process.exit(1);
    }
    // Parse the WARCs one at a time, in the order given.
    for (const warcPath of warcPaths) {
      await parseWarc(warcPath);
    }
    break;
  default:
    console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
    process.exit(1);
}

View file

@ -32,12 +32,6 @@ const getPrevPrecio = db
.limit(1)
.prepare();
if (process.argv[1].endsWith("/scrap.ts")) {
for (const path of process.argv.slice(2)) {
await parseWarc(path);
}
}
export type Precio = typeof schema.precios.$inferInsert;
export type Precioish = Omit<
Precio,