revertir custom warc por ahora

This commit is contained in:
Cat /dev/Nulo 2023-12-23 15:45:00 -03:00
parent d006c6cf5c
commit ed1d5b2ef0
3 changed files with 14 additions and 191 deletions

View file

@ -44,18 +44,12 @@ importers:
linkedom:
specifier: ^0.16.5
version: 0.16.5
mitata:
specifier: ^0.1.6
version: 0.1.6
nanoid:
specifier: ^5.0.4
version: 5.0.4
p-map:
specifier: ^7.0.0
version: 7.0.0
simple-zstd:
specifier: ^1.4.2
version: 1.4.2
undici:
specifier: ^6.2.0
version: 6.2.0
@ -316,10 +310,6 @@ packages:
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
dev: false
/buffer-from@1.1.2:
resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==}
dev: false
/bun-types@1.0.18:
resolution: {integrity: sha512-1XZ7AxOF8oO8FZtw1xj006JAKxEjulK3dUhsktZVN95vXBlsf4NIjQxfistVdpt24v3H2I9BwHp+UU+gXSSpAw==}
dev: true
@ -344,10 +334,6 @@ packages:
resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==}
dev: false
/core-util-is@1.0.3:
resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==}
dev: false
/crypto-random-string@4.0.0:
resolution: {integrity: sha512-x8dy3RnvYdlUcPOjkEHqozhiwzKNSq7GcPuXFbnyMOCHxX8V3OgIg/pYuabl2sbUPfIJaeAQB7PMOK8DFIdoRA==}
engines: {node: '>=12'}
@ -464,29 +450,10 @@ packages:
optional: true
dev: false
/duplex-maker@1.0.0:
resolution: {integrity: sha512-KoHuzggxg7f+vvjqOHfXxaQYI1POzBm+ah0eec7YDssZmbt6QFBI8d1nl5GQwAgR2f+VQCPvyvZtmWWqWuFtlA==}
dev: false
/duplexify@3.7.1:
resolution: {integrity: sha512-07z8uv2wMyS51kKhD1KsdXJg5WQ6t93RneqRxUHnskXVtlYYkLqM0gqStQZ3pj073g687jPCHrqNfCzawLYh5g==}
dependencies:
end-of-stream: 1.4.4
inherits: 2.0.4
readable-stream: 2.3.8
stream-shift: 1.0.1
dev: false
/emoji-regex@8.0.0:
resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==}
dev: false
/end-of-stream@1.4.4:
resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==}
dependencies:
once: 1.4.0
dev: false
/entities@4.5.0:
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
engines: {node: '>=0.12'}
@ -568,10 +535,6 @@ packages:
entities: 4.5.0
dev: false
/inherits@2.0.4:
resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
dev: false
/is-fullwidth-code-point@3.0.0:
resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==}
engines: {node: '>=8'}
@ -582,14 +545,6 @@ packages:
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
dev: false
/is-zst@1.0.0:
resolution: {integrity: sha512-ZA5lvshKAl8z30dX7saXLpVhpsq3d2EHK9uf7qtUjnOtdw4XBpAoWb2RvZ5kyoaebdoidnGI0g2hn9Z7ObPbww==}
dev: false
/isarray@1.0.0:
resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==}
dev: false
/linkedom@0.16.5:
resolution: {integrity: sha512-FtcuLuxDtlKWWilm5Z0HgmrfMwO0tOfC6tu47fRXj2/KGEeDSh4ihiDwFKZSbJj6zh520r8XZjZ7v2Jb30HAQA==}
dependencies:
@ -600,10 +555,6 @@ packages:
uhyphen: 0.2.0
dev: false
/mitata@0.1.6:
resolution: {integrity: sha512-VKQ0r3jriTOU9E2Z+mwbZrUmbg4Li4QyFfi7kfHKl6reZhGzL0AYlu3wE0VPXzIwA5xnFzmEQoBwCcNT8stUkA==}
dev: false
/nanoid@5.0.4:
resolution: {integrity: sha512-vAjmBf13gsmhXSgBrtIclinISzFFy22WwCYoyilZlsrRXNIHSwgFQ1bEdjRwMT3aoadeIF6HMuDRlOxzfXV8ig==}
engines: {node: ^18 || >=20}
@ -616,17 +567,6 @@ packages:
boolbase: 1.0.0
dev: false
/once@1.4.0:
resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==}
dependencies:
wrappy: 1.0.2
dev: false
/os-tmpdir@1.0.2:
resolution: {integrity: sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==}
engines: {node: '>=0.10.0'}
dev: false
/p-map@7.0.0:
resolution: {integrity: sha512-EZl03dLKv3RypkrjlevZoNwQMSy4bAblWcR18zhonktnN4fUs3asFQKSe0awn982omGxamvbejqQKQYDJYHCEg==}
engines: {node: '>=18'}
@ -649,51 +589,6 @@ packages:
resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
dev: false
/peek-stream@1.1.3:
resolution: {integrity: sha512-FhJ+YbOSBb9/rIl2ZeE/QHEsWn7PqNYt8ARAY3kIgNGOk13g9FGyIY6JIl/xB/3TFRVoTv5as0l11weORrTekA==}
dependencies:
buffer-from: 1.1.2
duplexify: 3.7.1
through2: 2.0.5
dev: false
/process-nextick-args@2.0.1:
resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==}
dev: false
/process-streams@1.0.1:
resolution: {integrity: sha512-Z+FHhxiBhiQ4t/xTY3Bo2SxZG/CehflyckFsQirAXFRf/BfVnDePzpo58eq9JI4XfFu1RnX5C5EAE6V4sce1+g==}
dependencies:
duplex-maker: 1.0.0
quotemeta: 0.0.0
tempfile: 1.1.1
dev: false
/quotemeta@0.0.0:
resolution: {integrity: sha512-1XGObUh7RN5b58vKuAsrlfqT+Rc4vmw8N4pP9gFCq1GFlTdV0Ex/D2Ro1Drvrqj++HPi3ig0Np17XPslELeMRA==}
dev: false
/readable-stream@2.3.8:
resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==}
dependencies:
core-util-is: 1.0.3
inherits: 2.0.4
isarray: 1.0.0
process-nextick-args: 2.0.1
safe-buffer: 5.1.2
string_decoder: 1.1.1
util-deprecate: 1.0.2
dev: false
/readable-stream@3.6.2:
resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
engines: {node: '>= 6'}
dependencies:
inherits: 2.0.4
string_decoder: 1.3.0
util-deprecate: 1.0.2
dev: false
/require-directory@2.1.1:
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
engines: {node: '>=0.10.0'}
@ -703,27 +598,6 @@ packages:
resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==}
dev: false
/safe-buffer@5.1.2:
resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
dev: false
/safe-buffer@5.2.1:
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
dev: false
/simple-zstd@1.4.2:
resolution: {integrity: sha512-kGYEvT33M5XfyQvvW4wxl3eKcWbdbCc1V7OZzuElnaXft0qbVzoIIXHXiCm3JCUki+MZKKmvjl8p2VGLJc5Y/A==}
dependencies:
is-zst: 1.0.0
peek-stream: 1.1.3
process-streams: 1.0.1
through2: 4.0.2
dev: false
/stream-shift@1.0.1:
resolution: {integrity: sha512-AiisoFqQ0vbGcZgQPY1cdP2I76glaVA/RauYR4G4thNFgkTqr90yXTo4LYX60Jl+sIlPNHHdGSwo01AvbKUSVQ==}
dev: false
/string-width@4.2.3:
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
engines: {node: '>=8'}
@ -733,18 +607,6 @@ packages:
strip-ansi: 6.0.1
dev: false
/string_decoder@1.1.1:
resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
dependencies:
safe-buffer: 5.1.2
dev: false
/string_decoder@1.3.0:
resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
dependencies:
safe-buffer: 5.2.1
dev: false
/strip-ansi@6.0.1:
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
engines: {node: '>=8'}
@ -757,14 +619,6 @@ packages:
engines: {node: '>=14.16'}
dev: false
/tempfile@1.1.1:
resolution: {integrity: sha512-NjT12fW6pSEKz1eVcADgaKfeM+XZ4+zSaqVz46XH7+CiEwcelnwtGWRRjF1p+xyW2PVgKKKS2UUw1LzRelntxg==}
engines: {node: '>=0.10.0'}
dependencies:
os-tmpdir: 1.0.2
uuid: 2.0.3
dev: false
/tempy@3.1.0:
resolution: {integrity: sha512-7jDLIdD2Zp0bDe5r3D2qtkd1QOCacylBuL7oa4udvN6v2pqr4+LcCr67C8DR1zkpaZ8XosF5m1yQSabKAW6f2g==}
engines: {node: '>=14.16'}
@ -775,19 +629,6 @@ packages:
unique-string: 3.0.0
dev: false
/through2@2.0.5:
resolution: {integrity: sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==}
dependencies:
readable-stream: 2.3.8
xtend: 4.0.2
dev: false
/through2@4.0.2:
resolution: {integrity: sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==}
dependencies:
readable-stream: 3.6.2
dev: false
/to-data-view@2.0.0:
resolution: {integrity: sha512-RGEM5KqlPHr+WVTPmGNAXNeFEmsBnlkxXaIfEpUYV0AST2Z5W1EGq9L/MENFrMMmL2WQr1wjkmZy/M92eKhjYA==}
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
@ -838,19 +679,10 @@ packages:
crypto-random-string: 4.0.0
dev: false
/util-deprecate@1.0.2:
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
dev: false
/uuid-random@1.3.2:
resolution: {integrity: sha512-UOzej0Le/UgkbWEO8flm+0y+G+ljUon1QWTEZOq1rnMAsxo2+SckbiZdKzAHHlVh6gJqI1TjC/xwgR50MuCrBQ==}
dev: false
/uuid@2.0.3:
resolution: {integrity: sha512-FULf7fayPdpASncVy4DLh3xydlXEJJpvIELjYjNeQWYUZ9pclcpvCZSr2gkmN2FrrGcI7G/cJsIEwk5/8vfXpg==}
deprecated: Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.
dev: false
/warcio@2.2.1:
resolution: {integrity: sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw==}
engines: {node: '>=18.0.0'}
@ -873,15 +705,6 @@ packages:
strip-ansi: 6.0.1
dev: false
/wrappy@1.0.2:
resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
dev: false
/xtend@4.0.2:
resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==}
engines: {node: '>=0.4'}
dev: false
/y18n@5.0.8:
resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==}
engines: {node: '>=10'}

View file

@ -13,10 +13,8 @@
"dependencies": {
"drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5",
"mitata": "^0.1.6",
"nanoid": "^5.0.4",
"p-map": "^7.0.0",
"simple-zstd": "^1.4.2",
"undici": "^6.2.0",
"warcio": "^2.2.1",
"zod": "^3.22.4"

View file

@ -13,9 +13,8 @@ import { getDiaProduct } from "./dia.js";
import { getCotoProduct } from "./coto.js";
import { join } from "path";
import pMap from "p-map";
import { parseWARC } from "./warc.js";
const DEBUG = true;
const DEBUG = false;
const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite);
@ -52,14 +51,17 @@ async function storePrecioPoint(point: Precio) {
async function parseWarc(path: string) {
// const warc = createReadStream(path);
const parser = parseWARC(path);
for await (const record of parser) {
if (record.fields.get("WARC-Type") === "response") {
const rawUri = record.fields.get("WARC-Target-URI");
if (!rawUri) continue;
const html = record.content.toString();
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
stderr: "ignore",
}).stdout;
const url = new URL(rawUri.replace(/^</, "").replace(/>$/, ""));
const parser = new WARCParser(warc);
for await (const record of parser) {
if (record.warcType === "response") {
if (!record.warcTargetURI) continue;
const html = await record.contentText();
const url = new URL(record.warcTargetURI);
try {
let ish: Precioish | undefined = undefined;
if (url.hostname === "www.carrefour.com.ar")
@ -72,8 +74,8 @@ async function parseWarc(path: string) {
const p: Precio = {
...ish,
fetchedAt: new Date(record.fields.get("WARC-Date")!),
url: url.toString(),
fetchedAt: new Date(record.warcDate!),
url: record.warcTargetURI,
};
if (ish) await storePrecioPoint(p);
@ -86,7 +88,7 @@ async function parseWarc(path: string) {
if (DEBUG) {
const urlHash = createHash("md5")
.update(url.toString())
.update(record.warcTargetURI!)
.digest("hex");
const output = join("debug", `${urlHash}.html`);
await writeFile(output, html);