From aae84b382967b8e87a37e85a08953028be2741d3 Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 1 Jan 2024 01:53:15 -0300 Subject: [PATCH] auto: usar warcificator --- scraper/Containerfile | 15 ++++++++++ scraper/auto.ts | 70 ++----------------------------------------- 2 files changed, 18 insertions(+), 67 deletions(-) diff --git a/scraper/Containerfile b/scraper/Containerfile index 8fbf68e..7df71d2 100644 --- a/scraper/Containerfile +++ b/scraper/Containerfile @@ -8,12 +8,27 @@ RUN bun install --frozen-lockfile \ && bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \ && rm -rf node_modules/ +# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd +FROM docker.io/rust:1.74 AS warcificator-builder +WORKDIR /usr/src/ +RUN rustup target add x86_64-unknown-linux-musl +RUN apt-get update && apt-get install -y musl-tools musl-dev + +RUN USER=root cargo new warcificator +WORKDIR /usr/src/warcificator +COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./ +RUN cargo build --release + +COPY ./warcificator/src ./src +RUN cargo install --target x86_64-unknown-linux-musl --path . + FROM base RUN apk add --no-cache wget zstd tini RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \ && chmod +x /etc/periodic/daily/scraper COPY --from=builder /tmp/cli.build.js /bin/scraper +COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/ COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle COPY --from=builder /usr/src/app/data /listas WORKDIR /app diff --git a/scraper/auto.ts b/scraper/auto.ts index 0547fda..582cfd1 100644 --- a/scraper/auto.ts +++ b/scraper/auto.ts @@ -22,9 +22,6 @@ const supermercados: Supermercado[] = [ Supermercado.Dia, ]; -// hacemos una cola para la compresión para no sobrecargar la CPU -const compressionQueue = new PQueue({ concurrency: 1 }); - // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU const scrapQueue = new PQueue({ concurrency: 1 }); @@ -77,7 +74,7 @@ class Auto { } async downloadList(supermercado: Supermercado) { - const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-")); + const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-")); let listPath: string; { @@ -117,15 +114,7 @@ class Auto { )}.warc.zst`; const zstdWarcPath = join(ctxPath, zstdWarcName); const subproc = Bun.spawn({ - cmd: [ - "wget", - "--no-verbose", - "--tries=3", - "--delete-after", - "--input-file", - listPath, - `--warc-file=temp`, - ], + cmd: ["warcificator", listPath, zstdWarcPath], stderr: "ignore", stdout: "ignore", cwd: ctxPath, @@ -133,18 +122,9 @@ class Auto { const t0 = performance.now(); await subproc.exited; this.inform( - `[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` + `[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` ); - const gzippedWarcPath = join(ctxPath, "temp.warc.gz"); - if (!(await fileExists(gzippedWarcPath))) { - const err = this.report(`no encontré el ${gzippedWarcPath}`); - throw err; - } - - await compressionQueue.add(() => - this.recompress(gzippedWarcPath, zstdWarcPath) - ); if (!(await fileExists(zstdWarcPath))) { const err = this.report(`no encontré el ${zstdWarcPath}`); throw err; @@ -190,49 +170,6 @@ class Auto { } } - /** - * toma un archivo gzippeado y lo recomprime con zstd. - * borra el archivo original. - */ - recompress(inputPath: string, outputPath: string) { - // XXX: por alguna razón no funciona en Bun 1.0.20 - // const decompressor = Bun.spawn({ - // cmd: ["gzip", "-dc", inputPath], - // stderr: "inherit", - // }); - // const compressor = Bun.spawn({ - // cmd: ["zstd", "-T0", "-15", "--long", "-o", outputPath], - // stdin: decompressor.stdout, - // // stderr: "inherit", - // }); - // const errorCode = await compressor.exited; - // if (errorCode !== 0) { - // const err = report(`zstd threw error code ${errorCode}`); - // throw err; - // } - - return new Promise((resolve, reject) => { - const decompressor = spawn("gzip", ["-dc", inputPath], { - stdio: [null, "pipe", null], - }); - const compressor = spawn( - "zstd", - ["-T0", "-15", "--long", "-o", outputPath], - { - stdio: ["pipe", null, null], - } - ); - decompressor.stdout.pipe(compressor.stdin); - compressor.on("close", (code) => { - if (code !== 0) { - const err = this.report(`zstd threw error code ${code}`); - reject(err); - } - resolve(void 0); - }); - }); - } - async uploadToBucket({ fileName, file, @@ -278,7 +215,6 @@ class Auto { await fetch(url); } } -// await recompress("sqlite.db.gz", "sqlite.db.zst"); // no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists async function fileExists(path: string) {