From ed1d5b2ef0dd391332fb28018a032c3ae949eedc Mon Sep 17 00:00:00 2001 From: Nulo Date: Sat, 23 Dec 2023 15:45:00 -0300 Subject: [PATCH] revertir custom warc por ahora --- pnpm-lock.yaml | 177 ------------------------------------------- scraper/package.json | 2 - scraper/scrap.ts | 26 ++++--- 3 files changed, 14 insertions(+), 191 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0fded50..16dc20f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -44,18 +44,12 @@ importers: linkedom: specifier: ^0.16.5 version: 0.16.5 - mitata: - specifier: ^0.1.6 - version: 0.1.6 nanoid: specifier: ^5.0.4 version: 5.0.4 p-map: specifier: ^7.0.0 version: 7.0.0 - simple-zstd: - specifier: ^1.4.2 - version: 1.4.2 undici: specifier: ^6.2.0 version: 6.2.0 @@ -316,10 +310,6 @@ packages: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} dev: false - /buffer-from@1.1.2: - resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} - dev: false - /bun-types@1.0.18: resolution: {integrity: sha512-1XZ7AxOF8oO8FZtw1xj006JAKxEjulK3dUhsktZVN95vXBlsf4NIjQxfistVdpt24v3H2I9BwHp+UU+gXSSpAw==} dev: true @@ -344,10 +334,6 @@ packages: resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} dev: false - /core-util-is@1.0.3: - resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} - dev: false - /crypto-random-string@4.0.0: resolution: {integrity: sha512-x8dy3RnvYdlUcPOjkEHqozhiwzKNSq7GcPuXFbnyMOCHxX8V3OgIg/pYuabl2sbUPfIJaeAQB7PMOK8DFIdoRA==} engines: {node: '>=12'} @@ -464,29 +450,10 @@ packages: optional: true dev: false - /duplex-maker@1.0.0: - resolution: {integrity: sha512-KoHuzggxg7f+vvjqOHfXxaQYI1POzBm+ah0eec7YDssZmbt6QFBI8d1nl5GQwAgR2f+VQCPvyvZtmWWqWuFtlA==} - dev: false - - /duplexify@3.7.1: - resolution: {integrity: sha512-07z8uv2wMyS51kKhD1KsdXJg5WQ6t93RneqRxUHnskXVtlYYkLqM0gqStQZ3pj073g687jPCHrqNfCzawLYh5g==} - dependencies: - end-of-stream: 1.4.4 - inherits: 2.0.4 - readable-stream: 2.3.8 - stream-shift: 1.0.1 - dev: false - /emoji-regex@8.0.0: resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==} dev: false - /end-of-stream@1.4.4: - resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} - dependencies: - once: 1.4.0 - dev: false - /entities@4.5.0: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} @@ -568,10 +535,6 @@ packages: entities: 4.5.0 dev: false - /inherits@2.0.4: - resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} - dev: false - /is-fullwidth-code-point@3.0.0: resolution: {integrity: sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==} engines: {node: '>=8'} @@ -582,14 +545,6 @@ packages: engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} dev: false - /is-zst@1.0.0: - resolution: {integrity: sha512-ZA5lvshKAl8z30dX7saXLpVhpsq3d2EHK9uf7qtUjnOtdw4XBpAoWb2RvZ5kyoaebdoidnGI0g2hn9Z7ObPbww==} - dev: false - - /isarray@1.0.0: - resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} - dev: false - /linkedom@0.16.5: resolution: {integrity: sha512-FtcuLuxDtlKWWilm5Z0HgmrfMwO0tOfC6tu47fRXj2/KGEeDSh4ihiDwFKZSbJj6zh520r8XZjZ7v2Jb30HAQA==} dependencies: @@ -600,10 +555,6 @@ packages: uhyphen: 0.2.0 dev: false - /mitata@0.1.6: - resolution: {integrity: sha512-VKQ0r3jriTOU9E2Z+mwbZrUmbg4Li4QyFfi7kfHKl6reZhGzL0AYlu3wE0VPXzIwA5xnFzmEQoBwCcNT8stUkA==} - dev: false - /nanoid@5.0.4: resolution: {integrity: sha512-vAjmBf13gsmhXSgBrtIclinISzFFy22WwCYoyilZlsrRXNIHSwgFQ1bEdjRwMT3aoadeIF6HMuDRlOxzfXV8ig==} engines: {node: ^18 || >=20} @@ -616,17 +567,6 @@ packages: boolbase: 1.0.0 dev: false - /once@1.4.0: - resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} - dependencies: - wrappy: 1.0.2 - dev: false - - /os-tmpdir@1.0.2: - resolution: {integrity: sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==} - engines: {node: '>=0.10.0'} - dev: false - /p-map@7.0.0: resolution: {integrity: sha512-EZl03dLKv3RypkrjlevZoNwQMSy4bAblWcR18zhonktnN4fUs3asFQKSe0awn982omGxamvbejqQKQYDJYHCEg==} engines: {node: '>=18'} @@ -649,51 +589,6 @@ packages: resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} dev: false - /peek-stream@1.1.3: - resolution: {integrity: sha512-FhJ+YbOSBb9/rIl2ZeE/QHEsWn7PqNYt8ARAY3kIgNGOk13g9FGyIY6JIl/xB/3TFRVoTv5as0l11weORrTekA==} - dependencies: - buffer-from: 1.1.2 - duplexify: 3.7.1 - through2: 2.0.5 - dev: false - - /process-nextick-args@2.0.1: - resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} - dev: false - - /process-streams@1.0.1: - resolution: {integrity: sha512-Z+FHhxiBhiQ4t/xTY3Bo2SxZG/CehflyckFsQirAXFRf/BfVnDePzpo58eq9JI4XfFu1RnX5C5EAE6V4sce1+g==} - dependencies: - duplex-maker: 1.0.0 - quotemeta: 0.0.0 - tempfile: 1.1.1 - dev: false - - /quotemeta@0.0.0: - resolution: {integrity: sha512-1XGObUh7RN5b58vKuAsrlfqT+Rc4vmw8N4pP9gFCq1GFlTdV0Ex/D2Ro1Drvrqj++HPi3ig0Np17XPslELeMRA==} - dev: false - - /readable-stream@2.3.8: - resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} - dependencies: - core-util-is: 1.0.3 - inherits: 2.0.4 - isarray: 1.0.0 - process-nextick-args: 2.0.1 - safe-buffer: 5.1.2 - string_decoder: 1.1.1 - util-deprecate: 1.0.2 - dev: false - - /readable-stream@3.6.2: - resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} - engines: {node: '>= 6'} - dependencies: - inherits: 2.0.4 - string_decoder: 1.3.0 - util-deprecate: 1.0.2 - dev: false - /require-directory@2.1.1: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} @@ -703,27 +598,6 @@ packages: resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} dev: false - /safe-buffer@5.1.2: - resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} - dev: false - - /safe-buffer@5.2.1: - resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} - dev: false - - /simple-zstd@1.4.2: - resolution: {integrity: sha512-kGYEvT33M5XfyQvvW4wxl3eKcWbdbCc1V7OZzuElnaXft0qbVzoIIXHXiCm3JCUki+MZKKmvjl8p2VGLJc5Y/A==} - dependencies: - is-zst: 1.0.0 - peek-stream: 1.1.3 - process-streams: 1.0.1 - through2: 4.0.2 - dev: false - - /stream-shift@1.0.1: - resolution: {integrity: sha512-AiisoFqQ0vbGcZgQPY1cdP2I76glaVA/RauYR4G4thNFgkTqr90yXTo4LYX60Jl+sIlPNHHdGSwo01AvbKUSVQ==} - dev: false - /string-width@4.2.3: resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} engines: {node: '>=8'} @@ -733,18 +607,6 @@ packages: strip-ansi: 6.0.1 dev: false - /string_decoder@1.1.1: - resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} - dependencies: - safe-buffer: 5.1.2 - dev: false - - /string_decoder@1.3.0: - resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} - dependencies: - safe-buffer: 5.2.1 - dev: false - /strip-ansi@6.0.1: resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} engines: {node: '>=8'} @@ -757,14 +619,6 @@ packages: engines: {node: '>=14.16'} dev: false - /tempfile@1.1.1: - resolution: {integrity: sha512-NjT12fW6pSEKz1eVcADgaKfeM+XZ4+zSaqVz46XH7+CiEwcelnwtGWRRjF1p+xyW2PVgKKKS2UUw1LzRelntxg==} - engines: {node: '>=0.10.0'} - dependencies: - os-tmpdir: 1.0.2 - uuid: 2.0.3 - dev: false - /tempy@3.1.0: resolution: {integrity: sha512-7jDLIdD2Zp0bDe5r3D2qtkd1QOCacylBuL7oa4udvN6v2pqr4+LcCr67C8DR1zkpaZ8XosF5m1yQSabKAW6f2g==} engines: {node: '>=14.16'} @@ -775,19 +629,6 @@ packages: unique-string: 3.0.0 dev: false - /through2@2.0.5: - resolution: {integrity: sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==} - dependencies: - readable-stream: 2.3.8 - xtend: 4.0.2 - dev: false - - /through2@4.0.2: - resolution: {integrity: sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==} - dependencies: - readable-stream: 3.6.2 - dev: false - /to-data-view@2.0.0: resolution: {integrity: sha512-RGEM5KqlPHr+WVTPmGNAXNeFEmsBnlkxXaIfEpUYV0AST2Z5W1EGq9L/MENFrMMmL2WQr1wjkmZy/M92eKhjYA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -838,19 +679,10 @@ packages: crypto-random-string: 4.0.0 dev: false - /util-deprecate@1.0.2: - resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - dev: false - /uuid-random@1.3.2: resolution: {integrity: sha512-UOzej0Le/UgkbWEO8flm+0y+G+ljUon1QWTEZOq1rnMAsxo2+SckbiZdKzAHHlVh6gJqI1TjC/xwgR50MuCrBQ==} dev: false - /uuid@2.0.3: - resolution: {integrity: sha512-FULf7fayPdpASncVy4DLh3xydlXEJJpvIELjYjNeQWYUZ9pclcpvCZSr2gkmN2FrrGcI7G/cJsIEwk5/8vfXpg==} - deprecated: Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details. - dev: false - /warcio@2.2.1: resolution: {integrity: sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw==} engines: {node: '>=18.0.0'} @@ -873,15 +705,6 @@ packages: strip-ansi: 6.0.1 dev: false - /wrappy@1.0.2: - resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} - dev: false - - /xtend@4.0.2: - resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==} - engines: {node: '>=0.4'} - dev: false - /y18n@5.0.8: resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} diff --git a/scraper/package.json b/scraper/package.json index c0f8ba9..3b6293a 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -13,10 +13,8 @@ "dependencies": { "drizzle-orm": "^0.29.1", "linkedom": "^0.16.5", - "mitata": "^0.1.6", "nanoid": "^5.0.4", "p-map": "^7.0.0", - "simple-zstd": "^1.4.2", "undici": "^6.2.0", "warcio": "^2.2.1", "zod": "^3.22.4" diff --git a/scraper/scrap.ts b/scraper/scrap.ts index eb387d9..d84c1ae 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -13,9 +13,8 @@ import { getDiaProduct } from "./dia.js"; import { getCotoProduct } from "./coto.js"; import { join } from "path"; import pMap from "p-map"; -import { parseWARC } from "./warc.js"; -const DEBUG = true; +const DEBUG = false; const sqlite = new Database("sqlite.db"); const db = drizzle(sqlite); @@ -52,14 +51,17 @@ async function storePrecioPoint(point: Precio) { async function parseWarc(path: string) { // const warc = createReadStream(path); - const parser = parseWARC(path); - for await (const record of parser) { - if (record.fields.get("WARC-Type") === "response") { - const rawUri = record.fields.get("WARC-Target-URI"); - if (!rawUri) continue; - const html = record.content.toString(); + const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], { + stderr: "ignore", + }).stdout; - const url = new URL(rawUri.replace(/^$/, "")); + const parser = new WARCParser(warc); + for await (const record of parser) { + if (record.warcType === "response") { + if (!record.warcTargetURI) continue; + const html = await record.contentText(); + + const url = new URL(record.warcTargetURI); try { let ish: Precioish | undefined = undefined; if (url.hostname === "www.carrefour.com.ar") @@ -72,8 +74,8 @@ async function parseWarc(path: string) { const p: Precio = { ...ish, - fetchedAt: new Date(record.fields.get("WARC-Date")!), - url: url.toString(), + fetchedAt: new Date(record.warcDate!), + url: record.warcTargetURI, }; if (ish) await storePrecioPoint(p); @@ -86,7 +88,7 @@ async function parseWarc(path: string) { if (DEBUG) { const urlHash = createHash("md5") - .update(url.toString()) + .update(record.warcTargetURI!) .digest("hex"); const output = join("debug", `${urlHash}.html`); await writeFile(output, html);