From ef697d1caebfd0c44a1c43b26dc98ae342936150 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Tue, 28 Nov 2023 19:34:31 -0300
Subject: [PATCH] fix bugs and enable all data.json sources in the container

Containerfile: split the single COPY into per-step layers so `npm install`
only reruns when package.json changes, and pass every known portal's
data.json to download_json.js instead of only datos.gob.ar's.

download_json.js: accept several URLs, write errors.jsonl through a stream
and close it in a finally block, skip distributions whose downloadURL does
not parse, prefix progress logs with the host, and retry transient
failures up to 10 times.
---
 .gitignore       |   3 +-
 Containerfile    |  35 ++++---------
 download_json.js | 134 ++++++++++++++++++++++++++++-------------------
 3 files changed, 91 insertions(+), 81 deletions(-)

diff --git a/.gitignore b/.gitignore
index f5533f2..f0b51e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ node_modules/
 dataJsons/
 log
 prueba
-datos.gob.ar*
\ No newline at end of file
+datos.gob.ar*
+data/
\ No newline at end of file
diff --git a/Containerfile b/Containerfile
index bd05e6c..a8f69cf 100644
--- a/Containerfile
+++ b/Containerfile
@@ -1,10 +1,13 @@
 FROM docker.io/alpine:3.18 as build
 RUN apk add --no-cache npm esbuild
 RUN npm install -g esbuild
-COPY package.json download_json.js /tmp/build/
-RUN cd /tmp/build && \
-  npm install && \
-  esbuild --bundle --format=cjs --platform=node --outfile=build.js download_json.js
+WORKDIR /tmp/build
+
+COPY package.json .
+RUN npm install
+
+COPY download_json.js .
+RUN esbuild --bundle --format=cjs --platform=node --outfile=build.js download_json.js
 
 FROM docker.io/alpine:3.18
 RUN apk add --no-cache nodejs-current tini
@@ -12,25 +15,4 @@ COPY pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pe
 COPY --from=build /tmp/build/build.js /usr/local/bin/download_json.js
 ENV NODE_EXTRA_CA_CERTS=/usr/lib/ca_intermediate_root_bundle.pem
 WORKDIR /data
-CMD ["/sbin/tini", "node", "/usr/local/bin/download_json.js", "https://datos.gob.ar/data.json"]
-
-# https://datos.gob.ar/data.json
-# http://datos.energia.gob.ar/data.json
-# https://datos.magyp.gob.ar/data.json
-# https://datos.acumar.gov.ar/data.json
-# https://datasets.datos.mincyt.gob.ar/data.json
-# https://datos.arsat.com.ar/data.json
-# https://datos.cultura.gob.ar/data.json
-# https://datos.mininterior.gob.ar/data.json
-# https://datos.produccion.gob.ar/data.json
-# https://datos.salud.gob.ar/data.json
-# https://datos.transporte.gob.ar/data.json
-# https://ckan.ciudaddemendoza.gov.ar/data.json
-# https://datos.santafe.gob.ar/data.json
-# https://datosabiertos.chaco.gob.ar/data.json
-# https://datosabiertos.gualeguaychu.gov.ar/data.json
-# https://datosabiertos.mercedes.gob.ar/data.json
-# http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json
-
-#https://datos.mindef.gov.ar
-#https://datosabiertos.desarrollosocial.gob.ar
+CMD ["/sbin/tini", "node", "/usr/local/bin/download_json.js", "https://datos.gob.ar/data.json", "http://datos.energia.gob.ar/data.json", "https://datos.magyp.gob.ar/data.json", "https://datos.acumar.gov.ar/data.json", "https://datasets.datos.mincyt.gob.ar/data.json", "https://datos.arsat.com.ar/data.json", "https://datos.cultura.gob.ar/data.json", "https://datos.mininterior.gob.ar/data.json", "https://datos.produccion.gob.ar/data.json", "https://datos.salud.gob.ar/data.json", "https://datos.transporte.gob.ar/data.json", "https://ckan.ciudaddemendoza.gov.ar/data.json", "https://datos.santafe.gob.ar/data.json", "https://datosabiertos.chaco.gob.ar/data.json", "https://datosabiertos.gualeguaychu.gov.ar/data.json", "https://datosabiertos.mercedes.gob.ar/data.json", "http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json", "https://datosabiertos.desarrollosocial.gob.ar", "http://datos.mindef.gov.ar/data.json"]
diff --git a/download_json.js b/download_json.js
index 6db1980..0e354c2 100644
--- a/download_json.js
+++ b/download_json.js
@@ -16,7 +16,7 @@ setGlobalDispatcher(
 /** key is host
  * @type {Map} */
 const limiters = new Map();
-const nThreads = process.env.N_THREADS ? parseInt(process.env.N_THREADS) : 16;
+const nThreads = process.env.N_THREADS ? parseInt(process.env.N_THREADS) : 8;
 
 class StatusCodeError extends Error {
   /**
@@ -28,13 +28,15 @@
   }
 }
 class TooManyRedirectsError extends Error {}
-
-let jsonUrlString = process.argv[2];
-if (!jsonUrlString) {
+const jsonUrls = process.argv.slice(2);
+if (jsonUrls.length < 1) {
   console.error("Please specify the URL of the data.json");
   process.exit(1);
 }
-downloadFromData(jsonUrlString);
+for (const url of jsonUrls)
+  downloadFromData(url).catch((error) =>
+    console.error(`${url} FAILED WITH`, error)
+  );
 
 /**
  * @param {string} jsonUrlString
@@ -43,62 +45,86 @@ async function downloadFromData(jsonUrlString) {
   const jsonUrl = new URL(jsonUrlString);
   const outputPath = jsonUrl.host;
   await mkdir(outputPath, { recursive: true });
-  const errorFile = await open(join(outputPath, "errors.jsonl"), "w");
+  const errorFile = (
+    await open(join(outputPath, "errors.jsonl"), "w")
+  ).createWriteStream();
 
-  const jsonRes = await fetch(jsonUrl);
-  // prettier-ignore
-  const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
-  await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
+  try {
+    const jsonRes = await fetch(jsonUrl);
+    // prettier-ignore
+    const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
+    await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
 
-  /** @type {DownloadJob[]} */
-  const jobs = parsed.dataset.flatMap((dataset) =>
-    dataset.distribution.map((dist) => ({
-      dataset,
-      dist,
-      url: patchUrl(new URL(dist.downloadURL)),
-      outputPath,
-      attempts: 0,
-    }))
-  );
-  const totalJobs = jobs.length;
-  let nFinished = 0;
-  let nErrors = 0;
+    /** @type {DownloadJob[]} */
+    const jobs = parsed.dataset.flatMap((dataset) =>
+      dataset.distribution
+        .filter((dist) => {
+          try {
+            patchUrl(new URL(dist.downloadURL));
+            return true;
+          } catch (error) {
+            errorFile.write(
+              JSON.stringify({
+                url: dist.downloadURL,
+                ...encodeError(error),
+              }) + "\n"
+            );
+            return false;
+          }
+        })
+        .map((dist) => ({
+          dataset,
+          dist,
+          url: patchUrl(new URL(dist.downloadURL)),
+          outputPath,
+          attempts: 0,
+        }))
+    );
+    const totalJobs = jobs.length;
+    let nFinished = 0;
+    let nErrors = 0;
 
-  // just in case, verify there are no duplicate files
-  chequearIdsDuplicados(jobs);
+    // just in case, verify there are no duplicate files
+    chequearIdsDuplicados(jobs);
 
-  shuffleArray(jobs);
+    shuffleArray(jobs);
 
-  const promises = jobs.map((job) => {
-    let limit = limiters.get(job.url.host);
-    if (!limit) {
-      limit = pLimit(nThreads);
-      limiters.set(job.url.host, limit);
-    }
-    return limit(async () => {
-      try {
-        await downloadDistWithRetries(job);
-      } catch (error) {
-        await errorFile.write(
-          JSON.stringify({
-            url: job.url.toString(),
-            ...encodeError(error),
-          }) + "\n"
-        );
-        nErrors++;
-      } finally {
-        nFinished++;
-      }
-    });
-  });
+    const promises = jobs.map((job) => {
+      let limit = limiters.get(job.url.host);
+      if (!limit) {
+        limit = pLimit(nThreads);
+        limiters.set(job.url.host, limit);
+      }
+      return limit(async () => {
+        try {
+          await downloadDistWithRetries(job);
+        } catch (error) {
+          await errorFile.write(
+            JSON.stringify({
+              url: job.url.toString(),
+              ...encodeError(error),
+            }) + "\n"
+          );
+          nErrors++;
+        } finally {
+          nFinished++;
+        }
+      });
+    });
 
-  process.stderr.write(`info: 0/${totalJobs} done\n`);
-  const interval = setInterval(() => {
-    process.stderr.write(`info: ${nFinished}/${totalJobs} done\n`);
-  }, 30000);
-  await Promise.all(promises);
-  clearInterval(interval);
-  if (nErrors > 0) console.error(`Finished with ${nErrors} errors`);
+    process.stderr.write(`info[${jsonUrl.host}]: 0/${totalJobs} done\n`);
+    const interval = setInterval(() => {
+      process.stderr.write(
+        `info[${jsonUrl.host}]: ${nFinished}/${totalJobs} done\n`
+      );
+    }, 30000);
+    await Promise.all(promises);
+    clearInterval(interval);
+    if (nErrors > 0)
+      console.error(`${jsonUrl.host}: Finished with ${nErrors} errors`);
+  } finally {
+    errorFile.close();
+  }
 }
 
 /**
@@ -125,7 +151,7 @@ async function downloadDistWithRetries(job, attempts = 0) {
     else if (
       !(error instanceof StatusCodeError) &&
      !(error instanceof TooManyRedirectsError) &&
-      attempts < 5
+      attempts < 10
     ) {
       await wait(5000);
       return await downloadDistWithRetries(job, attempts + 1);
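
Note on the concurrency model: download_json.js caps parallel requests per
host rather than globally, by keeping one p-limit instance per hostname in
the limiters map. A minimal self-contained sketch of that pattern for Node
18+ ESM (the withHostLimit helper and the example URLs are illustrative,
not part of this repo):

import pLimit from "p-limit";

const nThreads = 8; // same default the patch sets via N_THREADS

/** One limiter per host. @type {Map<string, ReturnType<typeof pLimit>>} */
const limiters = new Map();

/** Illustrative helper: run fn under the concurrency cap of url's host. */
function withHostLimit(url, fn) {
  let limit = limiters.get(url.host);
  if (!limit) {
    limit = pLimit(nThreads); // at most nThreads requests in flight per host
    limiters.set(url.host, limit);
  }
  return limit(fn);
}

// URLs on the same host queue behind each other; other hosts proceed freely.
const urls = [
  "https://datos.gob.ar/a.csv",
  "https://datos.gob.ar/b.csv",
  "https://datos.salud.gob.ar/c.csv",
];
await Promise.all(
  urls.map((u) =>
    withHostLimit(new URL(u), async () => {
      const res = await fetch(u);
      if (!res.ok) throw new Error(`HTTP ${res.status} for ${u}`);
      console.log(u, (await res.arrayBuffer()).byteLength, "bytes");
    })
  )
);

Shuffling the job list before queueing (shuffleArray in the patch) spreads
the initial requests across hosts, so a single slow portal does not
monopolize the first wave of downloads.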
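The last hunk raises the retry budget from 5 to 10 attempts. Stripped of
project specifics, the policy in downloadDistWithRetries amounts to: retry
with a fixed 5-second pause, but give up immediately on errors a retry
cannot fix. A sketch under those assumptions (PermanentError and
withRetries are illustrative names standing in for
StatusCodeError/TooManyRedirectsError and the real function):

const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

class PermanentError extends Error {} // stands in for StatusCodeError etc.

async function withRetries(fn, attempts = 0) {
  try {
    return await fn();
  } catch (error) {
    // Permanent failures and exhausted budgets propagate to the caller.
    if (error instanceof PermanentError || attempts >= 10) throw error;
    await wait(5000); // fixed 5 s pause between attempts, as in the patch
    return withRetries(fn, attempts + 1);
  }
}

// Usage: transient network errors get retried; HTTP error statuses do not.
const data = await withRetries(async () => {
  const res = await fetch("https://datos.gob.ar/data.json");
  if (!res.ok) throw new PermanentError(`HTTP ${res.status}`);
  return res.json();
});
console.log(Object.keys(data));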