mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
transform
This commit is contained in:
parent
a54db788df
commit
61727faa84
1 changed files with 27 additions and 17 deletions
|
@ -1,7 +1,6 @@
|
||||||
// import { run, bench, group, baseline } from "mitata";
|
// import { run, bench, group, baseline } from "mitata";
|
||||||
import { createReadStream } from "node:fs";
|
import { createReadStream } from "node:fs";
|
||||||
import { Writable } from "node:stream";
|
import { Readable } from "stream";
|
||||||
import { pipeline } from "node:stream/promises";
|
|
||||||
import { getCarrefourProduct } from "./carrefour.js";
|
import { getCarrefourProduct } from "./carrefour.js";
|
||||||
import { WARCParser } from "warcio";
|
import { WARCParser } from "warcio";
|
||||||
// import { ZSTDDecompress } from "simple-zstd";
|
// import { ZSTDDecompress } from "simple-zstd";
|
||||||
|
@ -44,25 +43,36 @@ await bench("warc", async () => {
|
||||||
// ).stdout;
|
// ).stdout;
|
||||||
|
|
||||||
const warc = Bun.stdin.stream();
|
const warc = Bun.stdin.stream();
|
||||||
|
// const warc = Readable.toWeb(process.stdin);
|
||||||
|
|
||||||
// const warc = process.stdin;
|
let buffer: Uint8Array[] = [];
|
||||||
|
const transform = new TransformStream<Uint8Array, Buffer>({
|
||||||
let arrays: Buffer[] = [];
|
transform(chunk, controller) {
|
||||||
|
buffer.push(chunk);
|
||||||
const myWritable = new Writable({
|
if (buffer.reduce((prev, curr) => prev + curr.length, 0) > 1024 * 1024) {
|
||||||
highWaterMark: 1024 * 1024 * 1024,
|
controller.enqueue(Buffer.concat(buffer));
|
||||||
writev(chunks, callback) {},
|
buffer = [];
|
||||||
|
}
|
||||||
|
},
|
||||||
|
flush(controller) {
|
||||||
|
controller.enqueue(Buffer.concat(buffer));
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
for await (const chunk of warc) {
|
warc.pipeTo(transform.writable);
|
||||||
|
|
||||||
|
let arrays: Buffer[] = [];
|
||||||
|
let n = 0;
|
||||||
|
for await (const chunk of transform.readable) {
|
||||||
|
console.debug(n);
|
||||||
// console.debug(chunk.length);
|
// console.debug(chunk.length);
|
||||||
const b = Buffer.from(chunk);
|
const b = Buffer.from(chunk);
|
||||||
arrays.push(b);
|
arrays.push(b);
|
||||||
if (
|
// if (
|
||||||
arrays.reduce((prev, curr) => prev + curr.length, 0) <
|
// arrays.reduce((prev, curr) => prev + curr.length, 0) <
|
||||||
1024 * 1024 * 1024
|
// 1024 * 1024 * 1024
|
||||||
)
|
// )
|
||||||
continue;
|
// continue;
|
||||||
let buf: Buffer;
|
let buf: Buffer;
|
||||||
while (
|
while (
|
||||||
((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
|
((buf = arrays.length === 1 ? arrays[0] : Buffer.concat(arrays)),
|
||||||
|
@ -94,8 +104,8 @@ await bench("warc", async () => {
|
||||||
}
|
}
|
||||||
const length = parseInt(fields.get("Content-Length"));
|
const length = parseInt(fields.get("Content-Length"));
|
||||||
const content = buf.subarray(0, length);
|
const content = buf.subarray(0, length);
|
||||||
console.debug(fields.get("WARC-Date"), content.length);
|
// console.debug(fields.get("WARC-Date"), content.length);
|
||||||
|
n++;
|
||||||
arrays = [
|
arrays = [
|
||||||
buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
|
buf.subarray(until + crlfcrlfB.length + length + crlfcrlfB.length),
|
||||||
];
|
];
|
||||||
|
|
Loading…
Reference in a new issue