From 9516fcc674df1e9bef09a5abed3a392df32daa43 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Mon, 27 Nov 2023 17:01:56 -0300
Subject: [PATCH] init

---
 Review note: the download_json.js hunk below was edited by hand after this
 patch was generated, so the blob id on its "index" line is stale. Regenerate
 the patch with `git format-patch` to refresh it.

 .gitignore       |   3 +
 download_json.js | 222 +++++++++++++++++++++++++++++++++++++++++++++++
 package.json     |  17 ++++
 pnpm-lock.yaml   | 169 ++++++++++++++++++++++++++++++++++++
 4 files changed, 411 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 download_json.js
 create mode 100644 package.json
 create mode 100644 pnpm-lock.yaml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..de7af75
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+node_modules/
+dataJsons/
+log
diff --git a/download_json.js b/download_json.js
new file mode 100644
index 0000000..993110a
--- /dev/null
+++ b/download_json.js
@@ -0,0 +1,222 @@
+// @ts-check
+import { mkdir, open } from "node:fs/promises";
+import { Agent, fetch } from "undici";
+import { join, normalize } from "node:path";
+import { pipeline } from "node:stream/promises";
+
+// Hosts we deliberately reach over plain http: because their HTTPS is broken.
+const brokenHttps = [
+  "datos.mindef.gov.ar", // certificate is for another domain
+  "datos.energia.gob.ar", // certificate is for another domain
+  "datos.minem.gob.ar", // expired (recorded as 2022-17-06)
+  "datos.agroindustria.gob.ar", // expired (recorded as 2022-03-10)
+  "andino.siu.edu.ar", // self-signed; everything seems to 404 here anyway
+  "datos.salud.gob.ar", // HTTPS times out
+  "datos.jus.gob.ar", // HTTPS redirects to URLs that do not exist
+  "www.hidro.gob.ar", // no HTTPS
+];
+
+// FYI: at least the following domains do not serve their full certificate
+// chain over HTTPS, so we need a hack (node_extra_ca_certs_mozilla_bundle) to
+// connect to them (ssllabs.com shows it). Hopefully their admins fix it.
+// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
+
+// Attempts per distribution, capped so a dead host cannot stall a worker.
+const MAX_ATTEMPTS = 3;
+// Pause before retrying a 403, which some servers use to say "slow down".
+const FORBIDDEN_BACKOFF_MS = 15000;
+
+const dispatcher = new Agent({
+  pipelining: 10,
+  maxRedirections: 20,
+});
+
+/** Raised when a response carries an HTTP status of 400 or above. */
+class StatusCodeError extends Error {
+  /**
+   * @param {number} code
+   */
+  constructor(code) {
+    super(`Status code: ${code}`);
+    this.code = code;
+  }
+}
+
+const outputPath = process.argv[2];
+if (!outputPath) {
+  console.error("Especificamente el output porfa");
+  process.exit(1);
+}
+await mkdir(outputPath, { recursive: true });
+
+// Read the JSON from stdin. Decoding in the stream keeps multi-byte UTF-8
+// characters intact when one is split across two chunks.
+process.stdin.setEncoding("utf8");
+const json = await process.stdin.toArray();
+const jsonString = json.join("");
+/** @type {{ dataset: Dataset[] }} */
+const parsed = JSON.parse(jsonString);
+
+const jobs = parsed.dataset.flatMap((dataset) =>
+  dataset.distribution.map((dist) => ({ dataset, dist })),
+);
+// Crude way of spreading the load across servers.
+shuffleArray(jobs);
+const totalJobs = jobs.length;
+let nFinished = 0;
+
+const duplicated = hasDuplicates(
+  jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`),
+);
+if (duplicated) {
+  console.error(
+    "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si",
+  );
+}
+
+// 128 concurrent workers, each draining the shared job list.
+const greens = Array(128)
+  .fill(0)
+  .map(() =>
+    (async () => {
+      let job;
+      while ((job = jobs.pop())) {
+        await downloadWithRetries(job.dataset, job.dist);
+        // Counted once per job, however many attempts it took.
+        nFinished++;
+      }
+    })(),
+  );
+
+const interval = setInterval(() => {
+  console.info(`info: ${nFinished}/${totalJobs} done`);
+}, 15000);
+await Promise.all(greens);
+clearInterval(interval);
+
+/**
+ * Downloads one distribution, retrying where that can help, and logs instead
+ * of throwing when it gives up so the calling worker moves on to the next job.
+ *
+ * A 403 is retried after a pause, errors that are not HTTP statuses (network
+ * failures and the like) are retried right away, and any other HTTP error
+ * status is final. Attempts are capped at MAX_ATTEMPTS.
+ * @argument {Dataset} dataset
+ * @argument {Distribution} dist
+ */
+async function downloadWithRetries(dataset, dist) {
+  const failure = `error: Failed to download URL ${dist.downloadURL} (${dataset.identifier}/${dist.identifier}):`;
+  // A real loop is needed to retry: `continue` inside `do ... while (0)` only
+  // re-tests the always-false condition and leaves the loop.
+  for (let attempt = 1; ; attempt++) {
+    const canRetry = attempt < MAX_ATTEMPTS;
+    try {
+      await downloadDist(dataset, dist);
+      return;
+    } catch (error) {
+      if (error instanceof StatusCodeError) {
+        if (error.code === 403 && canRetry) {
+          console.debug(
+            `debug: reintentando ${dist.downloadURL} porque tiró 403`,
+          );
+          await wait(FORBIDDEN_BACKOFF_MS);
+          continue;
+        }
+        console.error(failure, error.toString());
+        return;
+      }
+      console.error(failure, error);
+      if (!canRetry) return;
+    }
+  }
+}
+
+/**
+ * Fetches a distribution and streams the response body to
+ * <outputPath>/<dataset identifier>/<distribution identifier>/<file name>.
+ * @argument {Dataset} dataset
+ * @argument {Distribution} dist
+ */
+async function downloadDist(dataset, dist) {
+  const url = new URL(dist.downloadURL);
+
+  // Always use HTTPS except where it is broken. Match on hostname: url.host
+  // also carries any explicit port, which would defeat the lookup.
+  url.protocol = brokenHttps.includes(url.hostname) ? "http:" : "https:";
+
+  const res = await fetch(url.toString(), {
+    dispatcher,
+  });
+  if (res.status >= 400) {
+    // Drop the unread body so the connection can be reused.
+    await res.body?.cancel();
+    throw new StatusCodeError(res.status);
+  }
+  // Checked before the output file is opened, so a response without a body
+  // neither leaks a file handle nor leaves an empty file behind.
+  if (!res.body) throw new Error("no body");
+
+  const fileDirPath = join(
+    outputPath,
+    sanitizeSuffix(dataset.identifier),
+    sanitizeSuffix(dist.identifier),
+  );
+  await mkdir(fileDirPath, { recursive: true });
+  const filePath = join(
+    fileDirPath,
+    sanitizeSuffix(dist.fileName || dist.identifier),
+  );
+  const outputFile = await open(filePath, "w");
+
+  await pipeline(res.body, outputFile.createWriteStream());
+}
+
+/** @typedef {object} Dataset
+ * @prop {string} identifier
+ * @prop {Distribution[]} distribution
+ */
+/** @typedef {object} Distribution
+ * @prop {string} identifier
+ * @prop {string} fileName
+ * @prop {string} downloadURL
+ */
+
+// https://security.stackexchange.com/a/123723
+/**
+ * Normalizes a path fragment and strips leading "../" segments so that,
+ * joined under a base directory, it cannot climb out of it.
+ * @argument {string} path
+ */
+function sanitizeSuffix(path) {
+  return normalize(path).replace(/^(\.\.(\/|\\|$))+/, "");
+}
+
+// https://stackoverflow.com/a/7376645
+/**
+ * Tells whether any value occurs more than once in the array.
+ * @argument {any[]} array
+ */
+function hasDuplicates(array) {
+  return new Set(array).size !== array.length;
+}
+
+// https://stackoverflow.com/a/12646864
+/**
+ * Shuffles the array in place (Fisher-Yates).
+ * @argument {any[]} array
+ */
+function shuffleArray(array) {
+  for (let i = array.length - 1; i > 0; i--) {
+    const j = Math.floor(Math.random() * (i + 1));
+    [array[i], array[j]] = [array[j], array[i]];
+  }
+}
+
+/**
+ * Resolves after `ms` milliseconds; immediately when `ms` is negative.
+ * @argument {number} ms
+ */
+function wait(ms) {
+  if (ms < 0) return Promise.resolve();
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..0c42442
--- /dev/null
+++ b/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "js",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "run": "env NODE_EXTRA_CA_CERTS=node_modules/node_extra_ca_certs_mozilla_bundle/ca_bundle/ca_intermediate_root_bundle.pem node download_json.js"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "node_extra_ca_certs_mozilla_bundle": "^1.0.5",
+    "undici": "^5.28.0"
+  }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
new file mode 100644
index 0000000..5543c10
--- /dev/null
+++ b/pnpm-lock.yaml
@@ -0,0 +1,169 @@
+lockfileVersion: '6.0'
+
+settings:
+  autoInstallPeers: true
+  excludeLinksFromLockfile: false
+
+dependencies:
+  node_extra_ca_certs_mozilla_bundle:
+    specifier: ^1.0.5
+    version: 1.0.5
+  undici:
+    specifier: ^5.28.0
+    version: 5.28.0
+
+packages:
+
+  /@fastify/busboy@2.1.0:
+    resolution: {integrity: sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==}
+    engines: {node: '>=14'}
+    dev: false
+
+  /asynckit@0.4.0:
+    resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
+    dev: false
+
+  /axios@0.27.2:
+    resolution: {integrity: sha512-t+yRIyySRTp/wua5xEr+z1q60QmLq8ABsS5O9Me1AsE5dfKqgnCFzwiCZZ/cGNd1lq4/7akDWMxdhVlucjmnOQ==}
+    dependencies:
+      follow-redirects: 1.15.3
+      form-data: 4.0.0
+    transitivePeerDependencies:
+      - debug
+    dev: false
+
+  /bluebird@3.7.2:
+    resolution: {integrity: sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==}
+    dev: false
+
+  /combined-stream@1.0.8:
+    resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
+    engines: {node: '>= 0.8'}
+    dependencies:
+      delayed-stream: 1.0.0
+    dev: false
+
+  /cross-env@6.0.3:
+    resolution: {integrity: sha512-+KqxF6LCvfhWvADcDPqo64yVIB31gv/jQulX2NGzKS/g3GEVz6/pt4wjHFtFWsHMddebWD/sDthJemzM4MaAag==}
+    engines: {node: '>=8.0'}
+    hasBin: true
+    dependencies:
+      cross-spawn: 7.0.3
+    dev: false
+
+  /cross-spawn@7.0.3:
+    resolution: {integrity: sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==}
+    engines: {node: '>= 8'}
+    dependencies:
+      path-key: 3.1.1
+      shebang-command: 2.0.0
+      which: 2.0.2
+    dev: false
+
+  /csvtojson@2.0.10:
+    resolution: {integrity: sha512-lUWFxGKyhraKCW8Qghz6Z0f2l/PqB1W3AO0HKJzGIQ5JRSlR651ekJDiGJbBT4sRNNv5ddnSGVEnsxP9XRCVpQ==}
+    engines: {node: '>=4.0.0'}
+    hasBin: true
+    dependencies:
+      bluebird: 3.7.2
+      lodash: 4.17.21
+      strip-bom: 2.0.0
+    dev: false
+
+  /delayed-stream@1.0.0:
+    resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
+    engines: {node: '>=0.4.0'}
+    dev: false
+
+  /follow-redirects@1.15.3:
+    resolution: {integrity: sha512-1VzOtuEM8pC9SFU1E+8KfTjZyMztRsgEfwQl44z8A25uy13jSzTj6dyK2Df52iV0vgHCfBwLhDWevLn95w5v6Q==}
+    engines: {node: '>=4.0'}
+    peerDependencies:
+      debug: '*'
+    peerDependenciesMeta:
+      debug:
+        optional: true
+    dev: false
+
+  /form-data@4.0.0:
+    resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==}
+    engines: {node: '>= 6'}
+    dependencies:
+      asynckit: 0.4.0
+      combined-stream: 1.0.8
+      mime-types: 2.1.35
+    dev: false
+
+  /is-utf8@0.2.1:
+    resolution: {integrity: sha512-rMYPYvCzsXywIsldgLaSoPlw5PfoB/ssr7hY4pLfcodrA5M/eArza1a9VmTiNIBNMjOGr1Ow9mTyU2o69U6U9Q==}
+    dev: false
+
+  /isexe@2.0.0:
+    resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
+    dev: false
+
+  /lodash@4.17.21:
+    resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==}
+    dev: false
+
+  /mime-db@1.52.0:
+    resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==}
+    engines: {node: '>= 0.6'}
+    dev: false
+
+  /mime-types@2.1.35:
+    resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==}
+    engines: {node: '>= 0.6'}
+    dependencies:
+      mime-db: 1.52.0
+    dev: false
+
+  /node_extra_ca_certs_mozilla_bundle@1.0.5:
+    resolution: {integrity: sha512-Y+wek3qK8WYybCIxArGTmCEJCJ/6uGud/HCJECBZPIgagF9ba90nhnQMxBcMUAwQaR53iphGYp0JzlVPpUBsjg==}
+    requiresBuild: true
+    dependencies:
+      axios: 0.27.2
+      cross-env: 6.0.3
+      csvtojson: 2.0.10
+    transitivePeerDependencies:
+      - debug
+    dev: false
+
+  /path-key@3.1.1:
+    resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
+    engines: {node: '>=8'}
+    dev: false
+
+  /shebang-command@2.0.0:
+    resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==}
+    engines: {node: '>=8'}
+    dependencies:
+      shebang-regex: 3.0.0
+    dev: false
+
+  /shebang-regex@3.0.0:
+    resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==}
+    engines: {node: '>=8'}
+    dev: false
+
+  /strip-bom@2.0.0:
+    resolution: {integrity: sha512-kwrX1y7czp1E69n2ajbG65mIo9dqvJ+8aBQXOGVxqwvNbsXdFM6Lq37dLAY3mknUwru8CfcCbfOLL/gMo+fi3g==}
+    engines: {node: '>=0.10.0'}
+    dependencies:
+      is-utf8: 0.2.1
+    dev: false
+
+  /undici@5.28.0:
+    resolution: {integrity: sha512-gM12DkXhlAc5+/TPe60iy9P6ETgVfqTuRJ6aQ4w8RYu0MqKuXhaq3/b86GfzDQnNA3NUO6aUNdvevrKH59D0Nw==}
+    engines: {node: '>=14.0'}
+    dependencies:
+      '@fastify/busboy': 2.1.0
+    dev: false
+
+  /which@2.0.2:
+    resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==}
+    engines: {node: '>= 8'}
+    hasBin: true
+    dependencies:
+      isexe: 2.0.0
+    dev: false