mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-27 03:56:19 +00:00
WIPcooking
This commit is contained in:
parent
01f97140ec
commit
a54db788df
1 changed files with 130 additions and 0 deletions
130
scraper/bench.ts
Normal file
130
scraper/bench.ts
Normal file
|
@ -0,0 +1,130 @@
|
|||
// import { run, bench, group, baseline } from "mitata";
|
||||
import { createReadStream } from "node:fs";
|
||||
import { Writable } from "node:stream";
|
||||
import { pipeline } from "node:stream/promises";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { WARCParser } from "warcio";
|
||||
// import { ZSTDDecompress } from "simple-zstd";
|
||||
// import { AutoWARCParser } from "node-warc";
|
||||
|
||||
// const html = await readFile("carrefour.html", "utf-8");
|
||||
// bench("carrefour", () => {
|
||||
// getCarrefourProduct(html);
|
||||
// });
|
||||
|
||||
// await bench("warcio", async () => {
|
||||
// const warc = Bun.spawn(
|
||||
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
|
||||
// {
|
||||
// // stdin: Bun.file().stream(),
|
||||
// }
|
||||
// ).stdout;
|
||||
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
|
||||
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
|
||||
|
||||
// const parser = new WARCParser(warc);
|
||||
// for await (const record of parser) {
|
||||
// const html = await record.contentText();
|
||||
// }
|
||||
// });
|
||||
|
||||
// Delimiters and markers used when scanning raw WARC bytes, kept in both
// string and Buffer form so the hot loop never re-encodes them.

/** Line terminator between WARC header lines. */
const crlf = "\r\n";
const crlfB = Buffer.from(crlf);

/** Blank-line sequence: ends a record header and also trails a record body. */
const crlfcrlf = crlf.repeat(2);
const crlfcrlfB = Buffer.from(crlfcrlf);

/** Version line expected as the first line of every record. */
const warc10B = Buffer.from("WARC/1.0");

/** Shared zero-length buffer, used as a stand-in for "no line". */
const emptyBuffer = Buffer.alloc(0);
|
||||
|
||||
// Benchmark: read uncompressed WARC data from stdin and split it into records
// by hand, logging each record's WARC-Date and payload size.
await bench("warc", async () => {
  // Alternative input: decompress the archive in a child process.
  // const warc = Bun.spawn(
  //   ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
  //   {
  //     stderr: "ignore",
  //   }
  // ).stdout;

  const warc = Bun.stdin.stream();

  // const warc = process.stdin;

  // Bytes to accumulate before each parse pass. Batching keeps the number of
  // Buffer.concat calls low.
  const batchBytes = 1024 * 1024 * 1024;

  // Chunks received but not yet consumed, plus their running byte total so
  // the size does not have to be re-summed on every chunk.
  let pending: Buffer[] = [];
  let pendingBytes = 0;

  /**
   * Parses every complete record currently buffered and keeps whatever
   * incomplete tail remains in `pending` for the next pass.
   */
  const parsePending = (): void => {
    const first = pending[0];
    if (first === undefined) return;
    const buf = pending.length === 1 ? first : Buffer.concat(pending);

    let offset = 0;
    for (;;) {
      // The header ends at the first blank line; without it the record
      // header has not fully arrived yet.
      const headerEnd = buf.indexOf(crlfcrlfB, offset);
      if (headerEnd === -1) break;

      const lines = splitBuffer(buf.subarray(offset, headerEnd), crlfB);
      const versionLine = lines[0] ?? emptyBuffer;
      if (!versionLine.equals(warc10B)) {
        throw new Error(
          `No WARC 1.0 header in '${versionLine.toString("utf8")}'`
        );
      }

      const fields = new Map<string, string>();
      for (const line of lines.slice(1)) {
        const [key, value] = parseField(line.toString("utf8"));
        fields.set(key, value);
      }

      const rawLength = fields.get("Content-Length");
      const length = Number.parseInt(rawLength ?? "", 10);
      if (!Number.isSafeInteger(length) || length < 0) {
        throw new Error(`Invalid Content-Length '${rawLength}' in WARC record`);
      }

      // Layout: header CRLFCRLF content CRLFCRLF. Use Content-Length (not a
      // search for the next "WARC/1.0") to decide whether the record is
      // complete, so payload bytes can never be mistaken for a boundary and
      // the final record is parsed too.
      const contentStart = headerEnd + crlfcrlfB.length;
      const contentEnd = contentStart + length;
      const recordEnd = contentEnd + crlfcrlfB.length;
      if (buf.length < recordEnd) break;

      const content = buf.subarray(contentStart, contentEnd);
      console.debug(fields.get("WARC-Date"), content.length);

      offset = recordEnd;
    }

    const rest = buf.subarray(offset);
    pending = rest.length > 0 ? [rest] : [];
    pendingBytes = rest.length;
  };

  for await (const chunk of warc) {
    // console.debug(chunk.length);
    const b = Buffer.from(chunk);
    pending.push(b);
    pendingBytes += b.length;
    if (pendingBytes < batchBytes) continue;
    parsePending();
  }

  // End of stream: parse whatever is still buffered, otherwise inputs smaller
  // than one batch (and the tail of larger ones) would be silently dropped.
  parsePending();
  if (pendingBytes > 0) {
    console.warn(
      `warc: ${pendingBytes} trailing bytes did not form a complete record`
    );
  }
});
|
||||
|
||||
/**
 * Splits `buffer` on every occurrence of the byte sequence `val`.
 *
 * The returned pieces are `subarray` views that share memory with `buffer`;
 * the delimiter itself is not included. Like `String.prototype.split`, the
 * result always has at least one element, and adjacent or leading/trailing
 * delimiters produce empty pieces.
 *
 * @param buffer - Bytes to split.
 * @param val - Non-empty delimiter to split on.
 * @returns The pieces between delimiters, in order.
 * @throws {RangeError} If `val` is empty. `Buffer#indexOf` reports an empty
 *   needle as found without advancing, so the scan would never terminate.
 */
function splitBuffer(buffer: Buffer, val: Buffer): Buffer[] {
  if (val.length === 0) {
    throw new RangeError("splitBuffer: delimiter must not be empty");
  }

  const parts: Buffer[] = [];
  let start = 0;
  let found = buffer.indexOf(val, start);
  while (found !== -1) {
    parts.push(buffer.subarray(start, found));
    start = found + val.length;
    found = buffer.indexOf(val, start);
  }
  // Whatever follows the last delimiter (possibly nothing) is the final piece.
  parts.push(buffer.subarray(start));
  return parts;
}
|
||||
|
||||
/**
 * Parses one `Name: value` header line into a `[name, value]` pair.
 *
 * The line is split at the first colon only, so values that themselves
 * contain `": "` (URIs, free text) are kept intact, and `Name:value` without
 * a space is accepted. Whitespace surrounding the value is removed.
 *
 * @param line - A single header line without its trailing CRLF.
 * @returns The field name and value. A line with no colon (including the
 *   empty line) yields `[line, ""]`.
 */
function parseField(line: string): [string, string] {
  const colon = line.indexOf(":");
  if (colon === -1) return [line, ""];
  return [line.slice(0, colon), line.slice(colon + 1).trim()];
}
|
||||
|
||||
/**
 * Runs `func` once, waits for it to settle, and prints how long it took
 * (as the difference of two `performance.now()` readings) via `console.debug`.
 *
 * @param name - Label printed in front of the measurement.
 * @param func - Async workload to time; a rejection propagates to the caller
 *   and nothing is printed.
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const finishedAt = performance.now();
  const elapsed = finishedAt - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
|
||||
|
||||
// await run({});
|
Loading…
Reference in a new issue