auto: usar warcificator

This commit is contained in:
Cat /dev/Nulo 2024-01-01 01:53:15 -03:00 committed by Nulo
parent 1caba93ad6
commit aae84b3829
2 changed files with 18 additions and 67 deletions

View file

@@ -8,12 +8,27 @@ RUN bun install --frozen-lockfile \
&& bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \ && bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
&& rm -rf node_modules/ && rm -rf node_modules/
# https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd
# Separate builder stage: compiles warcificator as a statically-linked
# musl binary so it can run inside the Alpine-based final image.
FROM docker.io/rust:1.74 AS warcificator-builder
WORKDIR /usr/src/
# musl target + toolchain are needed for a fully static binary (no glibc in Alpine)
RUN rustup target add x86_64-unknown-linux-musl
RUN apt-get update && apt-get install -y musl-tools musl-dev
# Create a dummy crate first so the dependency build below can be cached
# in its own layer, independent of source-code changes.
RUN USER=root cargo new warcificator
WORKDIR /usr/src/warcificator
COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./
# Pre-build dependencies against the dummy main.rs (Docker layer cache).
# NOTE(review): this builds for the default gnu target, while the install
# below targets musl — confirm this cache layer actually saves a rebuild.
RUN cargo build --release
COPY ./warcificator/src ./src
RUN cargo install --target x86_64-unknown-linux-musl --path .
FROM base FROM base
RUN apk add --no-cache wget zstd tini RUN apk add --no-cache wget zstd tini
RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \ RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
&& chmod +x /etc/periodic/daily/scraper && chmod +x /etc/periodic/daily/scraper
COPY --from=builder /tmp/cli.build.js /bin/scraper COPY --from=builder /tmp/cli.build.js /bin/scraper
COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/
COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
COPY --from=builder /usr/src/app/data /listas COPY --from=builder /usr/src/app/data /listas
WORKDIR /app WORKDIR /app

View file

@@ -22,9 +22,6 @@ const supermercados: Supermercado[] = [
Supermercado.Dia, Supermercado.Dia,
]; ];
// hacemos una cola para la compresión para no sobrecargar la CPU
const compressionQueue = new PQueue({ concurrency: 1 });
// hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU // hacemos una cola para el scrapeo para no tener varios writers a la BD y no sobrecargar la CPU
const scrapQueue = new PQueue({ concurrency: 1 }); const scrapQueue = new PQueue({ concurrency: 1 });
@@ -77,7 +74,7 @@ class Auto {
} }
async downloadList(supermercado: Supermercado) { async downloadList(supermercado: Supermercado) {
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-")); const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
let listPath: string; let listPath: string;
{ {
@@ -117,15 +114,7 @@ class Auto {
)}.warc.zst`; )}.warc.zst`;
const zstdWarcPath = join(ctxPath, zstdWarcName); const zstdWarcPath = join(ctxPath, zstdWarcName);
const subproc = Bun.spawn({ const subproc = Bun.spawn({
cmd: [ cmd: ["warcificator", listPath, zstdWarcPath],
"wget",
"--no-verbose",
"--tries=3",
"--delete-after",
"--input-file",
listPath,
`--warc-file=temp`,
],
stderr: "ignore", stderr: "ignore",
stdout: "ignore", stdout: "ignore",
cwd: ctxPath, cwd: ctxPath,
@@ -133,18 +122,9 @@ class Auto {
const t0 = performance.now(); const t0 = performance.now();
await subproc.exited; await subproc.exited;
this.inform( this.inform(
`[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` `[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
); );
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
if (!(await fileExists(gzippedWarcPath))) {
const err = this.report(`no encontré el ${gzippedWarcPath}`);
throw err;
}
await compressionQueue.add(() =>
this.recompress(gzippedWarcPath, zstdWarcPath)
);
if (!(await fileExists(zstdWarcPath))) { if (!(await fileExists(zstdWarcPath))) {
const err = this.report(`no encontré el ${zstdWarcPath}`); const err = this.report(`no encontré el ${zstdWarcPath}`);
throw err; throw err;
@@ -190,49 +170,6 @@ class Auto {
} }
} }
/**
 * Recompresses a gzip-compressed file into zstd.
 *
 * NOTE(review): the original comment claimed the input file is deleted,
 * but nothing here removes it — neither `gzip -dc` nor `zstd -o` deletes
 * its input. Confirm whether cleanup is handled by the caller.
 *
 * @param inputPath path to the existing .gz file
 * @param outputPath path the .zst output is written to
 * @returns a promise that resolves when zstd exits with code 0
 */
recompress(inputPath: string, outputPath: string) {
  // XXX: por alguna razón no funciona en Bun 1.0.20
  // (a Bun.spawn pipeline kept failing, so node:child_process is used instead)
  return new Promise<void>((resolve, reject) => {
    // gzip decompresses to stdout; zstd reads stdin and writes outputPath
    const decompressor = spawn("gzip", ["-dc", inputPath], {
      stdio: [null, "pipe", null],
    });
    const compressor = spawn(
      "zstd",
      ["-T0", "-15", "--long", "-o", outputPath],
      {
        stdio: ["pipe", null, null],
      }
    );
    // Without these handlers, a spawn failure (e.g. binary not installed)
    // emits an unhandled 'error' event and crashes the process instead of
    // rejecting this promise.
    decompressor.on("error", reject);
    compressor.on("error", reject);
    // If gzip dies mid-stream, zstd just sees EOF and may exit 0 with a
    // truncated archive — surface the gzip failure explicitly.
    decompressor.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`gzip threw error code ${code}`));
      }
    });
    decompressor.stdout.pipe(compressor.stdin);
    compressor.on("close", (code) => {
      if (code !== 0) {
        reject(this.report(`zstd threw error code ${code}`));
        return; // don't fall through to resolve() after rejecting
      }
      resolve();
    });
  });
}
async uploadToBucket({ async uploadToBucket({
fileName, fileName,
file, file,
@@ -278,7 +215,6 @@ class Auto {
await fetch(url); await fetch(url);
} }
} }
// await recompress("sqlite.db.gz", "sqlite.db.zst");
// no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists // no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists
async function fileExists(path: string) { async function fileExists(path: string) {