From a54db788dfd0a1216ab2f72b14ebe29eea1f8ee1 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Fri, 22 Dec 2023 23:45:31 -0300
Subject: [PATCH] WIPcooking

---
 scraper/bench.ts | 130 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 scraper/bench.ts

diff --git a/scraper/bench.ts b/scraper/bench.ts
new file mode 100644
index 0000000..e78e324
--- /dev/null
+++ b/scraper/bench.ts
@@ -0,0 +1,130 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";

// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
//   getCarrefourProduct(html);
// });

// await bench("warcio", async () => {
//   const warc = Bun.spawn(
//     ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
//     {
//       // stdin: Bun.file().stream(),
//     }
//   ).stdout;
//   // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
//   // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());

//   const parser = new WARCParser(warc);
//   for await (const record of parser) {
//     const html = await record.contentText();
//   }
// });

// Byte constants used by the hand-rolled WARC parser below.
const crlf = "\r\n";
const crlfB = Buffer.from(crlf, "utf-8");
const crlfcrlf = crlf + crlf;
const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8");
const warc10B = Buffer.from("WARC/1.0", "utf-8");
const emptyBuffer = Buffer.from("", "utf-8");

await bench("warc", async () => {
  // const warc = Bun.spawn(
  //   ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
  //   {
  //     stderr: "ignore",
  //   }
  // ).stdout;

  // The uncompressed WARC is expected on stdin.
  const warc = Bun.stdin.stream();

  // const warc = process.stdin;

  // Chunks read so far that have not yet been parsed into records.
  let arrays: Buffer[] = [];

  // Unused leftover from experimenting with a Writable sink; kept for now.
  const myWritable = new Writable({
    highWaterMark: 1024 * 1024 * 1024,
    writev(chunks, callback) {
      callback();
    },
  });

  for await (const chunk of warc) {
    // console.debug(chunk.length);
    const b = Buffer.from(chunk);
    arrays.push(b);
    // Keep buffering until ~1 GiB has accumulated, then parse what we have.
    // (WIP: anything still buffered when the stream ends is never parsed.)
    if (
      arrays.reduce((prev, curr) => prev + curr.length, 0) <
      1024 * 1024 * 1024
    )
      continue;
    let buf: Buffer;
    // Keep extracting records while the buffer also contains the *next*
    // record's "WARC/1.0" marker, i.e. at least one complete record is buffered.
    while (
      ((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
      buf.subarray(warc10B.length).includes(warc10B))
    ) {
      // The record header ends at the first CRLFCRLF.
      const until = buf.indexOf(crlfcrlfB);
      const header = buf.subarray(0, until);

      const lines = splitBuffer(header, crlfB);
      let i = 0;
      const nextLine = () => {
        const line = lines[i];
        i++;
        return line ? line : emptyBuffer;
      };
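      // Layout of each WARC/1.0 record this loop expects (per the WARC spec;
      // stated here as a reading aid, not something this patch changes):
      //   WARC/1.0 CRLF
      //   <Name>: <value> CRLF   (repeated header fields, incl. Content-Length)
      //   CRLF
      //   <Content-Length bytes of record block>
      //   CRLF CRLF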
      let line: Buffer;
      if (!(line = nextLine()).equals(warc10B)) {
        throw new Error(`No WARC 1.0 header in '${line}'`);
      }

      // Collect the header fields until the header lines run out.
      let field: [string, string];
      const fields = new Map<string, string>();
      while (
        ((line = nextLine()),
        (field = parseField(line.toString("utf8"))),
        line.length !== 0)
      ) {
        fields.set(field[0], field[1]);
      }

      const length = parseInt(fields.get("Content-Length") ?? "0");
      // The record block starts right after the CRLFCRLF that ended the header.
      const content = buf.subarray(
        until + crlfcrlfB.length,
        until + crlfcrlfB.length + length
      );
      console.debug(fields.get("WARC-Date"), content.length);

      // Drop this record (header + block + trailing CRLFCRLF) and keep going
      // with whatever is left in the buffer.
      arrays = [
        buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
      ];
    }
  }
});

// Splits `buffer` on every occurrence of `val` (the separators are dropped).
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
  const bufs: Buffer[] = [];
  let rest = buffer;
  let i: number;
  while (((i = rest.indexOf(val)), i !== -1)) {
    bufs.push(rest.subarray(0, i));
    rest = rest.subarray(i + val.length);
  }
  bufs.push(rest);
  return bufs;
}

// Parses a "Name: value" header line; the value may itself contain ": ".
function parseField(line: string): [string, string] {
  const idx = line.indexOf(": ");
  if (idx === -1) return [line, ""];
  return [line.slice(0, idx), line.slice(idx + 2)];
}

// Tiny stand-in for mitata while it is commented out above.
async function bench(name: string, func: () => Promise<void>) {
  const t0 = performance.now();
  await func();
  const t1 = performance.now();
  console.debug(`${name} took ${t1 - t0}ms`);
}

// await run({});
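For comparison while this is WIP, a minimal sketch of the warcio baseline that is commented out above, reading the same decompressed WARC from stdin (e.g. `zstd -dc ../data/carrefour.warc.zst | bun bench-warcio.ts`). The file name and the stdin-based input are assumptions for illustration, not part of this patch:

// bench-warcio.ts (hypothetical companion file, not added by this patch)
import { WARCParser } from "warcio";

const t0 = performance.now();
// Same input convention as bench.ts: the uncompressed WARC arrives on stdin.
const parser = new WARCParser(Bun.stdin.stream());
let records = 0;
for await (const record of parser) {
  // Touch the body so the comparison does roughly the same work as bench.ts.
  await record.contentText();
  records++;
}
console.debug(`warcio took ${performance.now() - t0}ms for ${records} records`);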