From 704634c9797554d43b2d541cc3594340779e4210 Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 28 Nov 2023 19:19:35 -0300 Subject: [PATCH] volver a descargar separado por host --- Containerfile | 26 +++++++++- download_json.js | 129 +++++++++++++++++++++++------------------------ package.json | 1 + pnpm-lock.yaml | 15 ++++++ 4 files changed, 102 insertions(+), 69 deletions(-) diff --git a/Containerfile b/Containerfile index 724390d..bd05e6c 100644 --- a/Containerfile +++ b/Containerfile @@ -1,6 +1,7 @@ FROM docker.io/alpine:3.18 as build RUN apk add --no-cache npm esbuild -COPY package.json download_json.js /tmp/build +RUN npm install -g esbuild +COPY package.json download_json.js /tmp/build/ RUN cd /tmp/build && \ npm install && \ esbuild --bundle --format=cjs --platform=node --outfile=build.js download_json.js @@ -11,4 +12,25 @@ COPY pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pe COPY --from=build /tmp/build/build.js /usr/local/bin/download_json.js ENV NODE_EXTRA_CA_CERTS=/usr/lib/ca_intermediate_root_bundle.pem WORKDIR /data -CMD ["/sbin/tini", "node", "/usr/local/bin/download_json.js", "https://datos.gob.ar/data.json"] \ No newline at end of file +CMD ["/sbin/tini", "node", "/usr/local/bin/download_json.js", "https://datos.gob.ar/data.json"] + +# https://datos.gob.ar/data.json +# http://datos.energia.gob.ar/data.json +# https://datos.magyp.gob.ar/data.json +# https://datos.acumar.gov.ar/data.json +# https://datasets.datos.mincyt.gob.ar/data.json +# https://datos.arsat.com.ar/data.json +# https://datos.cultura.gob.ar/data.json +# https://datos.mininterior.gob.ar/data.json +# https://datos.produccion.gob.ar/data.json +# https://datos.salud.gob.ar/data.json +# https://datos.transporte.gob.ar/data.json +# https://ckan.ciudaddemendoza.gov.ar/data.json +# https://datos.santafe.gob.ar/data.json +# https://datosabiertos.chaco.gob.ar/data.json +# https://datosabiertos.gualeguaychu.gov.ar/data.json +# https://datosabiertos.mercedes.gob.ar/data.json +# http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json + +#https://datos.mindef.gov.ar +#https://datosabiertos.desarrollosocial.gob.ar diff --git a/download_json.js b/download_json.js index bbde6d9..6db1980 100644 --- a/download_json.js +++ b/download_json.js @@ -2,6 +2,7 @@ import { mkdir, open, writeFile } from "node:fs/promises"; import { Agent, fetch, request, setGlobalDispatcher } from "undici"; import { join, normalize } from "node:path"; +import pLimit from "p-limit"; // FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen. // www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar @@ -12,6 +13,11 @@ setGlobalDispatcher( }) ); +/** key es host + * @type {Map} */ +const limiters = new Map(); +const nThreads = process.env.N_THREADS ? parseInt(process.env.N_THREADS) : 16; + class StatusCodeError extends Error { /** * @param {number} code @@ -63,68 +69,70 @@ async function downloadFromData(jsonUrlString) { shuffleArray(jobs); - const nThreads = process.env.N_THREADS ? parseInt(process.env.N_THREADS) : 64; - const greens = Array(nThreads) - .fill(0) - .map(() => - (async () => { - let job; - while ((job = jobs.pop())) { - if (job.waitUntil) await waitUntil(job.waitUntil); - try { - await downloadDist(job); - } catch (error) { - // algunos servidores usan 403 como coso para decir "calmate" - // intentar hasta 15 veces con 15 segundos de por medio - if ( - error instanceof StatusCodeError && - error.code === 403 && - job.url.host === "minsegar-my.sharepoint.com" && - job.attempts < 15 - ) { - jobs.unshift({ - ...job, - attempts: job.attempts + 1, - waitUntil: nowPlusNSeconds(15), - }); - continue; - } - // si no fue un error de http, reintentar hasta 5 veces con 5 segundos de por medio - else if ( - !(error instanceof StatusCodeError) && - !(error instanceof TooManyRedirectsError) && - job.attempts < 5 - ) { - jobs.unshift({ - ...job, - attempts: job.attempts + 1, - waitUntil: nowPlusNSeconds(5), - }); - continue; - } else { - await errorFile.write( - JSON.stringify({ - url: job.url.toString(), - ...encodeError(error), - }) + "\n" - ); - nErrors++; - } - } - nFinished++; - } - })() - ); - process.stderr.write(`greens: ${greens.length}\n`); + const promises = jobs.map((job) => { + let limit = limiters.get(job.url.host); + if (!limit) { + limit = pLimit(nThreads); + limiters.set(job.url.host, limit); + } + return limit(async () => { + try { + await downloadDistWithRetries(job); + } catch (error) { + await errorFile.write( + JSON.stringify({ + url: job.url.toString(), + ...encodeError(error), + }) + "\n" + ); + nErrors++; + } finally { + nFinished++; + } + }); + }); + process.stderr.write(`info: 0/${totalJobs} done\n`); const interval = setInterval(() => { process.stderr.write(`info: ${nFinished}/${totalJobs} done\n`); }, 30000); - await Promise.all(greens); + await Promise.all(promises); clearInterval(interval); if (nErrors > 0) console.error(`Finished with ${nErrors} errors`); } +/** + * @argument {DownloadJob} job + * @argument {number} attempts + */ +async function downloadDistWithRetries(job, attempts = 0) { + const { url } = job; + try { + await downloadDist(job); + } catch (error) { + // algunos servidores usan 403 como coso para decir "calmate" + // intentar hasta 15 veces con 15 segundos de por medio + if ( + error instanceof StatusCodeError && + error.code === 403 && + url.host === "minsegar-my.sharepoint.com" && + attempts < 15 + ) { + await wait(15000); + return await downloadDistWithRetries(job, attempts + 1); + } + // si no fue un error de http, reintentar hasta 5 veces con 5 segundos de por medio + else if ( + !(error instanceof StatusCodeError) && + !(error instanceof TooManyRedirectsError) && + attempts < 5 + ) { + await wait(5000); + return await downloadDistWithRetries(job, attempts + 1); + } else throw error; + } +} + /** * @argument {DownloadJob} job */ @@ -210,19 +218,6 @@ function hasDuplicates(array) { function wait(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } -/** @argument {Date} date */ -function waitUntil(date) { - return wait(Math.max(+new Date() - +date, 0)); -} -/** - * genera una Date de ahora+n segundos - * @param {number} seconds - */ -function nowPlusNSeconds(seconds) { - let d = new Date(); - d.setSeconds(d.getSeconds() + seconds); - return d; -} function encodeError(error) { if (error instanceof StatusCodeError) diff --git a/package.json b/package.json index 9fd9758..72ae195 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "author": "", "license": "ISC", "dependencies": { + "p-limit": "^5.0.0", "undici": "^5.28.0" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2c02bf1..7bb4b92 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -5,6 +5,9 @@ settings: excludeLinksFromLockfile: false dependencies: + p-limit: + specifier: ^5.0.0 + version: 5.0.0 undici: specifier: ^5.28.0 version: 5.28.0 @@ -27,6 +30,13 @@ packages: undici-types: 5.26.5 dev: true + /p-limit@5.0.0: + resolution: {integrity: sha512-/Eaoq+QyLSiXQ4lyYV23f14mZRQcXnxfHrN0vCai+ak9G0pp9iEQukIIZq5NccEvwRB8PUnZT0KsOoDCINS1qQ==} + engines: {node: '>=18'} + dependencies: + yocto-queue: 1.0.0 + dev: false + /undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} dev: true @@ -37,3 +47,8 @@ packages: dependencies: '@fastify/busboy': 2.1.0 dev: false + + /yocto-queue@1.0.0: + resolution: {integrity: sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g==} + engines: {node: '>=12.20'} + dev: false