auto: usar warcificator

2024-11-22 22:26:19 +00:00 · 2024-01-01 01:53:15 -03:00 · 2024-01-01 01:53:15 -03:00 · aae84b3829
commit aae84b3829
parent 1caba93ad6
2 changed files with 18 additions and 67 deletions
--- a/scraper/Containerfile
+++ b/scraper/Containerfile
@ -8,12 +8,27 @@ RUN bun install --frozen-lockfile \
    && bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js \
    && rm -rf node_modules/
 # https://dev.to/deciduously/use-multi-stage-docker-builds-for-statically-linked-rust-binaries-3jgd
 # Build stage: compiles the warcificator binary for the musl target so it can
 # be copied into the final image.
 FROM docker.io/rust:1.74 AS warcificator-builder
 WORKDIR /usr/src/
 RUN rustup target add x86_64-unknown-linux-musl
 RUN apt-get update && apt-get install -y musl-tools musl-dev
 # Create a placeholder crate so dependencies can be compiled (and cached as
 # their own layer) before the real sources are copied in.
 RUN USER=root cargo new warcificator
 WORKDIR /usr/src/warcificator
 COPY ./warcificator/Cargo.toml ./warcificator/Cargo.lock ./
 # Pre-build the dependencies for the SAME target the install step below uses;
 # a host-target build lands in a different target directory and would not be
 # reused by `cargo install --target x86_64-unknown-linux-musl`.
 RUN cargo build --release --target x86_64-unknown-linux-musl
 COPY ./warcificator/src ./src
 RUN cargo install --target x86_64-unknown-linux-musl --path .
 FROM base
 RUN apk add --no-cache wget zstd tini
 RUN printf "#!/bin/sh\nexec bun /bin/scraper auto\n" > /etc/periodic/daily/scraper \
    && chmod +x /etc/periodic/daily/scraper
 COPY --from=builder /tmp/cli.build.js /bin/scraper
 COPY --from=warcificator-builder /usr/local/cargo/bin/warcificator /bin/
 COPY --from=builder /usr/src/app/db-datos/drizzle /bin/drizzle
 COPY --from=builder /usr/src/app/data /listas
 WORKDIR /app
--- a/scraper/auto.ts
+++ b/scraper/auto.ts
@ -22,9 +22,6 @@ const supermercados: Supermercado[] = [
  Supermercado.Dia,
 ];
 // use a queue for compression so we don't overload the CPU
 const compressionQueue = new PQueue({ concurrency: 1 });
 // use a queue for scraping so there aren't several writers to the DB and we don't overload the CPU
 const scrapQueue = new PQueue({ concurrency: 1 });
@ -77,7 +74,7 @@ class Auto {
  }
  async downloadList(supermercado: Supermercado) {
-    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
+    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));
    let listPath: string;
    {
@ -117,15 +114,7 @@ class Auto {
    )}.warc.zst`;
    const zstdWarcPath = join(ctxPath, zstdWarcName);
    const subproc = Bun.spawn({
-      cmd: [
+      cmd: ["warcificator", listPath, zstdWarcPath],
        "wget",
        "--no-verbose",
        "--tries=3",
        "--delete-after",
        "--input-file",
        listPath,
        `--warc-file=temp`,
      ],
      stderr: "ignore",
      stdout: "ignore",
      cwd: ctxPath,
@ -133,18 +122,9 @@ class Auto {
    const t0 = performance.now();
    await subproc.exited;
    this.inform(
-      `[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
+      `[downloader] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
    );
    const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
    if (!(await fileExists(gzippedWarcPath))) {
      const err = this.report(`no encontré el ${gzippedWarcPath}`);
      throw err;
    }
    await compressionQueue.add(() =>
      this.recompress(gzippedWarcPath, zstdWarcPath)
    );
    if (!(await fileExists(zstdWarcPath))) {
      const err = this.report(`no encontré el ${zstdWarcPath}`);
      throw err;
@ -190,49 +170,6 @@ class Auto {
    }
  }
  /**
   * Recompresses a gzip file as zstd by piping `gzip -dc` into `zstd`.
   *
   * The input file is NOT removed: `gzip -dc` only writes the decompressed
   * stream to stdout and leaves the source in place.
   *
   * @param inputPath path of the gzip-compressed input file
   * @param outputPath path handed to `zstd -o` for the compressed output
   * @returns a promise that resolves (with `undefined`) once gzip has exited
   *   with code 0 and zstd has closed with code 0; it rejects with the value
   *   returned by `this.report` on a non-zero exit, or with the spawn error if
   *   either process fails to start
   */
  recompress(inputPath: string, outputPath: string) {
    // XXX: for some reason this doesn't work in Bun 1.0.20
    // const decompressor = Bun.spawn({
    //   cmd: ["gzip", "-dc", inputPath],
    //   stderr: "inherit",
    // });
    // const compressor = Bun.spawn({
    //   cmd: ["zstd", "-T0", "-15", "--long", "-o", outputPath],
    //   stdin: decompressor.stdout,
    //   // stderr: "inherit",
    // });
    // const errorCode = await compressor.exited;
    // if (errorCode !== 0) {
    //   const err = report(`zstd threw error code ${errorCode}`);
    //   throw err;
    // }
    return new Promise((resolve, reject) => {
      const decompressor = spawn("gzip", ["-dc", inputPath], {
        stdio: [null, "pipe", null],
      });
      const compressor = spawn(
        "zstd",
        ["-T0", "-15", "--long", "-o", outputPath],
        {
          stdio: ["pipe", null, null],
        }
      );

      // Both children have to succeed before the promise resolves; otherwise
      // a failed gzip would leave a truncated .zst that looks like a success.
      let pending = 2;
      const finishOne = () => {
        pending -= 1;
        if (pending === 0) resolve(void 0);
      };

      // A spawn failure (e.g. missing binary) is emitted as "error"; without
      // a listener it would surface as an uncaught exception instead of a
      // rejection the caller can handle.
      decompressor.on("error", reject);
      compressor.on("error", reject);

      decompressor.stdout.pipe(compressor.stdin);

      decompressor.on("exit", (code) => {
        if (code !== 0) {
          reject(this.report(`gzip threw error code ${code}`));
          return;
        }
        finishOne();
      });
      compressor.on("close", (code) => {
        if (code !== 0) {
          const err = this.report(`zstd threw error code ${code}`);
          reject(err);
          // stop here so a failure never falls through to the success path
          return;
        }
        finishOne();
      });
    });
  }
  async uploadToBucket({
    fileName,
    file,
@ -278,7 +215,6 @@ class Auto {
    await fetch(url);
  }
 }
 // await recompress("sqlite.db.gz", "sqlite.db.zst");
 // no se llama exists porque bun tiene un bug en el que usa fs.exists por mas que exista una funcion llamada exists
 async function fileExists(path: string) {