mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
revertir custom warc por ahora
This commit is contained in:
parent
d006c6cf5c
commit
ed1d5b2ef0
3 changed files with 14 additions and 191 deletions
177
pnpm-lock.yaml
177
pnpm-lock.yaml
|
@ -44,18 +44,12 @@ importers:
|
|||
linkedom:
|
||||
specifier: ^0.16.5
|
||||
version: 0.16.5
|
||||
mitata:
|
||||
specifier: ^0.1.6
|
||||
version: 0.1.6
|
||||
nanoid:
|
||||
specifier: ^5.0.4
|
||||
version: 5.0.4
|
||||
p-map:
|
||||
specifier: ^7.0.0
|
||||
version: 7.0.0
|
||||
simple-zstd:
|
||||
specifier: ^1.4.2
|
||||
version: 1.4.2
|
||||
undici:
|
||||
specifier: ^6.2.0
|
||||
version: 6.2.0
|
||||
|
@ -316,10 +310,6 @@ packages:
|
|||
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
|
||||
dev: false
|
||||
|
||||
/buffer-from@1.1.2:
|
||||
resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==}
|
||||
dev: false
|
||||
|
||||
/bun-types@1.0.18:
|
||||
resolution: {integrity: sha512-1XZ7AxOF8oO8FZtw1xj006JAKxEjulK3dUhsktZVN95vXBlsf4NIjQxfistVdpt24v3H2I9BwHp+UU+gXSSpAw==}
|
||||
dev: true
|
||||
|
@ -344,10 +334,6 @@ packages:
|
|||
resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==}
|
||||
dev: false
|
||||
|
||||
/core-util-is@1.0.3:
|
||||
resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==}
|
||||
dev: false
|
||||
|
||||
/crypto-random-string@4.0.0:
|
||||
resolution: {integrity: sha512-x8dy3RnvYdlUcPOjkEHqozhiwzKNSq7GcPuXFbnyMOCHxX8V3OgIg/pYuabl2sbUPfIJaeAQB7PMOK8DFIdoRA==}
|
||||
engines: {node: '>=12'}
|
||||
|
@ -464,29 +450,10 @@ packages:
|
|||
optional: true
|
||||
dev: false
|
||||
|
||||
/duplex-maker@1.0.0:
|
||||
resolution: {integrity: sha512-KoHuzggxg7f+vvjqOHfXxaQYI1POzBm+ah0eec7YDssZmbt6QFBI8d1nl5GQwAgR2f+VQCPvyvZtmWWqWuFtlA==}
|
||||
dev: false
|
||||
|
||||
/duplexify@3.7.1:
|
||||
resolution: {integrity: sha512-07z8uv2wMyS51kKhD1KsdXJg5WQ6t93RneqRxUHnskXVtlYYkLqM0gqStQZ3pj073g687jPCHrqNfCzawLYh5g==}
|
||||
dependencies:
|
||||
end-of-stream: 1.4.4
|
||||
inherits: 2.0.4
|
||||
readable-stream: 2.3.8
|
||||
stream-shift: 1.0.1
|
||||
dev: false
|
||||
|
||||
/emoji-regex@8.0.0:
|
||||
resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==}
|
||||
dev: false
|
||||
|
||||
/end-of-stream@1.4.4:
|
||||
resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==}
|
||||
dependencies:
|
||||
once: 1.4.0
|
||||
dev: false
|
||||
|
||||
/entities@4.5.0:
|
||||
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
@ -568,10 +535,6 @@ packages:
|
|||
entities: 4.5.0
|
||||
dev: false
|
||||
|
||||
/inherits@2.0.4:
|
||||
resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
|
||||
dev: false
|
||||
|
||||
/is-fullwidth-code-point@3.0.0:
|
||||
resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -582,14 +545,6 @@ packages:
|
|||
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
|
||||
dev: false
|
||||
|
||||
/is-zst@1.0.0:
|
||||
resolution: {integrity: sha512-ZA5lvshKAl8z30dX7saXLpVhpsq3d2EHK9uf7qtUjnOtdw4XBpAoWb2RvZ5kyoaebdoidnGI0g2hn9Z7ObPbww==}
|
||||
dev: false
|
||||
|
||||
/isarray@1.0.0:
|
||||
resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==}
|
||||
dev: false
|
||||
|
||||
/linkedom@0.16.5:
|
||||
resolution: {integrity: sha512-FtcuLuxDtlKWWilm5Z0HgmrfMwO0tOfC6tu47fRXj2/KGEeDSh4ihiDwFKZSbJj6zh520r8XZjZ7v2Jb30HAQA==}
|
||||
dependencies:
|
||||
|
@ -600,10 +555,6 @@ packages:
|
|||
uhyphen: 0.2.0
|
||||
dev: false
|
||||
|
||||
/mitata@0.1.6:
|
||||
resolution: {integrity: sha512-VKQ0r3jriTOU9E2Z+mwbZrUmbg4Li4QyFfi7kfHKl6reZhGzL0AYlu3wE0VPXzIwA5xnFzmEQoBwCcNT8stUkA==}
|
||||
dev: false
|
||||
|
||||
/nanoid@5.0.4:
|
||||
resolution: {integrity: sha512-vAjmBf13gsmhXSgBrtIclinISzFFy22WwCYoyilZlsrRXNIHSwgFQ1bEdjRwMT3aoadeIF6HMuDRlOxzfXV8ig==}
|
||||
engines: {node: ^18 || >=20}
|
||||
|
@ -616,17 +567,6 @@ packages:
|
|||
boolbase: 1.0.0
|
||||
dev: false
|
||||
|
||||
/once@1.4.0:
|
||||
resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==}
|
||||
dependencies:
|
||||
wrappy: 1.0.2
|
||||
dev: false
|
||||
|
||||
/os-tmpdir@1.0.2:
|
||||
resolution: {integrity: sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
dev: false
|
||||
|
||||
/p-map@7.0.0:
|
||||
resolution: {integrity: sha512-EZl03dLKv3RypkrjlevZoNwQMSy4bAblWcR18zhonktnN4fUs3asFQKSe0awn982omGxamvbejqQKQYDJYHCEg==}
|
||||
engines: {node: '>=18'}
|
||||
|
@ -649,51 +589,6 @@ packages:
|
|||
resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
|
||||
dev: false
|
||||
|
||||
/peek-stream@1.1.3:
|
||||
resolution: {integrity: sha512-FhJ+YbOSBb9/rIl2ZeE/QHEsWn7PqNYt8ARAY3kIgNGOk13g9FGyIY6JIl/xB/3TFRVoTv5as0l11weORrTekA==}
|
||||
dependencies:
|
||||
buffer-from: 1.1.2
|
||||
duplexify: 3.7.1
|
||||
through2: 2.0.5
|
||||
dev: false
|
||||
|
||||
/process-nextick-args@2.0.1:
|
||||
resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==}
|
||||
dev: false
|
||||
|
||||
/process-streams@1.0.1:
|
||||
resolution: {integrity: sha512-Z+FHhxiBhiQ4t/xTY3Bo2SxZG/CehflyckFsQirAXFRf/BfVnDePzpo58eq9JI4XfFu1RnX5C5EAE6V4sce1+g==}
|
||||
dependencies:
|
||||
duplex-maker: 1.0.0
|
||||
quotemeta: 0.0.0
|
||||
tempfile: 1.1.1
|
||||
dev: false
|
||||
|
||||
/quotemeta@0.0.0:
|
||||
resolution: {integrity: sha512-1XGObUh7RN5b58vKuAsrlfqT+Rc4vmw8N4pP9gFCq1GFlTdV0Ex/D2Ro1Drvrqj++HPi3ig0Np17XPslELeMRA==}
|
||||
dev: false
|
||||
|
||||
/readable-stream@2.3.8:
|
||||
resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==}
|
||||
dependencies:
|
||||
core-util-is: 1.0.3
|
||||
inherits: 2.0.4
|
||||
isarray: 1.0.0
|
||||
process-nextick-args: 2.0.1
|
||||
safe-buffer: 5.1.2
|
||||
string_decoder: 1.1.1
|
||||
util-deprecate: 1.0.2
|
||||
dev: false
|
||||
|
||||
/readable-stream@3.6.2:
|
||||
resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
|
||||
engines: {node: '>= 6'}
|
||||
dependencies:
|
||||
inherits: 2.0.4
|
||||
string_decoder: 1.3.0
|
||||
util-deprecate: 1.0.2
|
||||
dev: false
|
||||
|
||||
/require-directory@2.1.1:
|
||||
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
@ -703,27 +598,6 @@ packages:
|
|||
resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==}
|
||||
dev: false
|
||||
|
||||
/safe-buffer@5.1.2:
|
||||
resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
|
||||
dev: false
|
||||
|
||||
/safe-buffer@5.2.1:
|
||||
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
|
||||
dev: false
|
||||
|
||||
/simple-zstd@1.4.2:
|
||||
resolution: {integrity: sha512-kGYEvT33M5XfyQvvW4wxl3eKcWbdbCc1V7OZzuElnaXft0qbVzoIIXHXiCm3JCUki+MZKKmvjl8p2VGLJc5Y/A==}
|
||||
dependencies:
|
||||
is-zst: 1.0.0
|
||||
peek-stream: 1.1.3
|
||||
process-streams: 1.0.1
|
||||
through2: 4.0.2
|
||||
dev: false
|
||||
|
||||
/stream-shift@1.0.1:
|
||||
resolution: {integrity: sha512-AiisoFqQ0vbGcZgQPY1cdP2I76glaVA/RauYR4G4thNFgkTqr90yXTo4LYX60Jl+sIlPNHHdGSwo01AvbKUSVQ==}
|
||||
dev: false
|
||||
|
||||
/string-width@4.2.3:
|
||||
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -733,18 +607,6 @@ packages:
|
|||
strip-ansi: 6.0.1
|
||||
dev: false
|
||||
|
||||
/string_decoder@1.1.1:
|
||||
resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
|
||||
dependencies:
|
||||
safe-buffer: 5.1.2
|
||||
dev: false
|
||||
|
||||
/string_decoder@1.3.0:
|
||||
resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
|
||||
dependencies:
|
||||
safe-buffer: 5.2.1
|
||||
dev: false
|
||||
|
||||
/strip-ansi@6.0.1:
|
||||
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -757,14 +619,6 @@ packages:
|
|||
engines: {node: '>=14.16'}
|
||||
dev: false
|
||||
|
||||
/tempfile@1.1.1:
|
||||
resolution: {integrity: sha512-NjT12fW6pSEKz1eVcADgaKfeM+XZ4+zSaqVz46XH7+CiEwcelnwtGWRRjF1p+xyW2PVgKKKS2UUw1LzRelntxg==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
dependencies:
|
||||
os-tmpdir: 1.0.2
|
||||
uuid: 2.0.3
|
||||
dev: false
|
||||
|
||||
/tempy@3.1.0:
|
||||
resolution: {integrity: sha512-7jDLIdD2Zp0bDe5r3D2qtkd1QOCacylBuL7oa4udvN6v2pqr4+LcCr67C8DR1zkpaZ8XosF5m1yQSabKAW6f2g==}
|
||||
engines: {node: '>=14.16'}
|
||||
|
@ -775,19 +629,6 @@ packages:
|
|||
unique-string: 3.0.0
|
||||
dev: false
|
||||
|
||||
/through2@2.0.5:
|
||||
resolution: {integrity: sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==}
|
||||
dependencies:
|
||||
readable-stream: 2.3.8
|
||||
xtend: 4.0.2
|
||||
dev: false
|
||||
|
||||
/through2@4.0.2:
|
||||
resolution: {integrity: sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==}
|
||||
dependencies:
|
||||
readable-stream: 3.6.2
|
||||
dev: false
|
||||
|
||||
/to-data-view@2.0.0:
|
||||
resolution: {integrity: sha512-RGEM5KqlPHr+WVTPmGNAXNeFEmsBnlkxXaIfEpUYV0AST2Z5W1EGq9L/MENFrMMmL2WQr1wjkmZy/M92eKhjYA==}
|
||||
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
|
||||
|
@ -838,19 +679,10 @@ packages:
|
|||
crypto-random-string: 4.0.0
|
||||
dev: false
|
||||
|
||||
/util-deprecate@1.0.2:
|
||||
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
|
||||
dev: false
|
||||
|
||||
/uuid-random@1.3.2:
|
||||
resolution: {integrity: sha512-UOzej0Le/UgkbWEO8flm+0y+G+ljUon1QWTEZOq1rnMAsxo2+SckbiZdKzAHHlVh6gJqI1TjC/xwgR50MuCrBQ==}
|
||||
dev: false
|
||||
|
||||
/uuid@2.0.3:
|
||||
resolution: {integrity: sha512-FULf7fayPdpASncVy4DLh3xydlXEJJpvIELjYjNeQWYUZ9pclcpvCZSr2gkmN2FrrGcI7G/cJsIEwk5/8vfXpg==}
|
||||
deprecated: Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.
|
||||
dev: false
|
||||
|
||||
/warcio@2.2.1:
|
||||
resolution: {integrity: sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw==}
|
||||
engines: {node: '>=18.0.0'}
|
||||
|
@ -873,15 +705,6 @@ packages:
|
|||
strip-ansi: 6.0.1
|
||||
dev: false
|
||||
|
||||
/wrappy@1.0.2:
|
||||
resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
|
||||
dev: false
|
||||
|
||||
/xtend@4.0.2:
|
||||
resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==}
|
||||
engines: {node: '>=0.4'}
|
||||
dev: false
|
||||
|
||||
/y18n@5.0.8:
|
||||
resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==}
|
||||
engines: {node: '>=10'}
|
||||
|
|
|
@ -13,10 +13,8 @@
|
|||
"dependencies": {
|
||||
"drizzle-orm": "^0.29.1",
|
||||
"linkedom": "^0.16.5",
|
||||
"mitata": "^0.1.6",
|
||||
"nanoid": "^5.0.4",
|
||||
"p-map": "^7.0.0",
|
||||
"simple-zstd": "^1.4.2",
|
||||
"undici": "^6.2.0",
|
||||
"warcio": "^2.2.1",
|
||||
"zod": "^3.22.4"
|
||||
|
|
|
@ -13,9 +13,8 @@ import { getDiaProduct } from "./dia.js";
|
|||
import { getCotoProduct } from "./coto.js";
|
||||
import { join } from "path";
|
||||
import pMap from "p-map";
|
||||
import { parseWARC } from "./warc.js";
|
||||
|
||||
const DEBUG = true;
|
||||
const DEBUG = false;
|
||||
|
||||
const sqlite = new Database("sqlite.db");
|
||||
const db = drizzle(sqlite);
|
||||
|
@ -52,14 +51,17 @@ async function storePrecioPoint(point: Precio) {
|
|||
async function parseWarc(path: string) {
|
||||
// const warc = createReadStream(path);
|
||||
|
||||
const parser = parseWARC(path);
|
||||
for await (const record of parser) {
|
||||
if (record.fields.get("WARC-Type") === "response") {
|
||||
const rawUri = record.fields.get("WARC-Target-URI");
|
||||
if (!rawUri) continue;
|
||||
const html = record.content.toString();
|
||||
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
||||
stderr: "ignore",
|
||||
}).stdout;
|
||||
|
||||
const url = new URL(rawUri.replace(/^</, "").replace(/>$/, ""));
|
||||
const parser = new WARCParser(warc);
|
||||
for await (const record of parser) {
|
||||
if (record.warcType === "response") {
|
||||
if (!record.warcTargetURI) continue;
|
||||
const html = await record.contentText();
|
||||
|
||||
const url = new URL(record.warcTargetURI);
|
||||
try {
|
||||
let ish: Precioish | undefined = undefined;
|
||||
if (url.hostname === "www.carrefour.com.ar")
|
||||
|
@ -72,8 +74,8 @@ async function parseWarc(path: string) {
|
|||
|
||||
const p: Precio = {
|
||||
...ish,
|
||||
fetchedAt: new Date(record.fields.get("WARC-Date")!),
|
||||
url: url.toString(),
|
||||
fetchedAt: new Date(record.warcDate!),
|
||||
url: record.warcTargetURI,
|
||||
};
|
||||
|
||||
if (ish) await storePrecioPoint(p);
|
||||
|
@ -86,7 +88,7 @@ async function parseWarc(path: string) {
|
|||
|
||||
if (DEBUG) {
|
||||
const urlHash = createHash("md5")
|
||||
.update(url.toString())
|
||||
.update(record.warcTargetURI!)
|
||||
.digest("hex");
|
||||
const output = join("debug", `${urlHash}.html`);
|
||||
await writeFile(output, html);
|
||||
|
|
Loading…
Reference in a new issue