From a54db788dfd0a1216ab2f72b14ebe29eea1f8ee1 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Fri, 22 Dec 2023 23:45:31 -0300
Subject: [PATCH] WIPcooking

---
 scraper/bench.ts | 130 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 scraper/bench.ts

diff --git a/scraper/bench.ts b/scraper/bench.ts
new file mode 100644
index 0000000..e78e324
--- /dev/null
+++ b/scraper/bench.ts
@@ -0,0 +1,130 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";

// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
//   getCarrefourProduct(html);
// });

// await bench("warcio", async () => {
//   const warc = Bun.spawn(
//     ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
//     {
//       // stdin: Bun.file().stream(),
//     }
//   ).stdout;
//   // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
//   // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());

//   const parser = new WARCParser(warc);
//   for await (const record of parser) {
//     const html = await record.contentText();
//   }
// });

// Byte constants used by the hand-rolled WARC parser below.
const crlf = "\r\n";
const crlfB = Buffer.from(crlf, "utf-8");
const crlfcrlf = crlf + crlf;
const crlfcrlfB = Buffer.from(crlfcrlf, "utf-8");
const warc10B = Buffer.from("WARC/1.0", "utf-8");
const emptyBuffer = Buffer.from("", "utf-8");

await bench("warc", async () => {
  // const warc = Bun.spawn(
  //   ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
  //   {
  //     stderr: "ignore",
  //   }
  // ).stdout;

  // The uncompressed WARC is expected on stdin.
  const warc = Bun.stdin.stream();

  // const warc = process.stdin;

  // Chunks read so far that have not yet been parsed into records.
  let arrays: Buffer[] = [];

  // Unused leftover from experimenting with a Writable sink; kept for now.
  const myWritable = new Writable({
    highWaterMark: 1024 * 1024 * 1024,
    writev(chunks, callback) {
      callback();
    },
  });

  for await (const chunk of warc) {
    // console.debug(chunk.length);
    const b = Buffer.from(chunk);
    arrays.push(b);
    // Keep buffering until ~1 GiB has accumulated, then parse what we have.
    // (WIP: anything still buffered when the stream ends is never parsed.)
    if (
      arrays.reduce((prev, curr) => prev + curr.length, 0) <
      1024 * 1024 * 1024
    )
      continue;
    let buf: Buffer;
    // Keep extracting records while the buffer also contains the *next*
    // record's "WARC/1.0" marker, i.e. at least one complete record is buffered.
    while (
      ((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
      buf.subarray(warc10B.length).includes(warc10B))
    ) {
      // The record header ends at the first CRLFCRLF.
      const until = buf.indexOf(crlfcrlfB);
      const header = buf.subarray(0, until);

      const lines = splitBuffer(header, crlfB);
      let i = 0;
      const nextLine = () => {
        const line = lines[i];
        i++;
        return line ? line : emptyBuffer;
      };
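      // Layout of each WARC/1.0 record this loop expects (per the WARC spec;
      // stated here as a reading aid, not something this patch changes):
      //   WARC/1.0 CRLF
      //   <Name>: <value> CRLF   (repeated header fields, incl. Content-Length)
      //   CRLF
      //   <Content-Length bytes of record block>
      //   CRLF CRLF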
      let line: Buffer;
      if (!(line = nextLine()).equals(warc10B)) {
        throw new Error(`No WARC 1.0 header in '${line}'`);
      }

      // Collect the header fields until the header lines run out.
      let field: [string, string];
      const fields = new Map<string, string>();
      while (
        ((line = nextLine()),
        (field = parseField(line.toString("utf8"))),
        line.length !== 0)
      ) {
        fields.set(field[0], field[1]);
      }

      const length = parseInt(fields.get("Content-Length") ?? "0");
      // The record block starts right after the CRLFCRLF that ended the header.
      const content = buf.subarray(
        until + crlfcrlfB.length,
        until + crlfcrlfB.length + length
      );
      console.debug(fields.get("WARC-Date"), content.length);

      // Drop this record (header + block + trailing CRLFCRLF) and keep going
      // with whatever is left in the buffer.
      arrays = [
        buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
      ];
    }
  }
});

// Splits `buffer` on every occurrence of `val` (the separators are dropped).
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
  const bufs: Buffer[] = [];
  let rest = buffer;
  let i: number;
  while (((i = rest.indexOf(val)), i !== -1)) {
    bufs.push(rest.subarray(0, i));
    rest = rest.subarray(i + val.length);
  }
  bufs.push(rest);
  return bufs;
}

// Parses a "Name: value" header line; the value may itself contain ": ".
function parseField(line: string): [string, string] {
  const idx = line.indexOf(": ");
  if (idx === -1) return [line, ""];
  return [line.slice(0, idx), line.slice(idx + 2)];
}

// Tiny stand-in for mitata while it is commented out above.
async function bench(name: string, func: () => Promise<void>) {
  const t0 = performance.now();
  await func();
  const t1 = performance.now();
  console.debug(`${name} took ${t1 - t0}ms`);
}

// await run({});
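For comparison while this is WIP, a minimal sketch of the warcio baseline that is commented out above, reading the same decompressed WARC from stdin (e.g. `zstd -dc ../data/carrefour.warc.zst | bun bench-warcio.ts`). The file name and the stdin-based input are assumptions for illustration, not part of this patch:

// bench-warcio.ts (hypothetical companion file, not added by this patch)
import { WARCParser } from "warcio";

const t0 = performance.now();
// Same input convention as bench.ts: the uncompressed WARC arrives on stdin.
const parser = new WARCParser(Bun.stdin.stream());
let records = 0;
for await (const record of parser) {
  // Touch the body so the comparison does roughly the same work as bench.ts.
  await record.contentText();
  records++;
}
console.debug(`warcio took ${performance.now() - t0}ms for ${records} records`);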