2023-11-27 20:01:56 +00:00
// @ts-check
2023-11-28 02:10:24 +00:00
import { mkdir , open , writeFile } from "node:fs/promises" ;
import { Agent , fetch , request , setGlobalDispatcher } from "undici" ;
2023-11-27 20:01:56 +00:00
import { join , normalize } from "node:path" ;
2023-11-28 22:19:35 +00:00
import pLimit from "p-limit" ;
2023-11-27 20:01:56 +00:00
2023-11-28 23:46:14 +00:00
const sitiosPorDefecto = [
"https://datos.gob.ar/data.json" ,
"http://datos.energia.gob.ar/data.json" ,
"https://datos.magyp.gob.ar/data.json" ,
"https://datos.acumar.gov.ar/data.json" ,
"https://datasets.datos.mincyt.gob.ar/data.json" ,
"https://datos.arsat.com.ar/data.json" ,
"https://datos.cultura.gob.ar/data.json" ,
"https://datos.mininterior.gob.ar/data.json" ,
"https://datos.produccion.gob.ar/data.json" ,
"https://datos.salud.gob.ar/data.json" ,
"https://datos.transporte.gob.ar/data.json" ,
"https://ckan.ciudaddemendoza.gov.ar/data.json" ,
"https://datos.santafe.gob.ar/data.json" ,
"https://datosabiertos.chaco.gob.ar/data.json" ,
"https://datosabiertos.mercedes.gob.ar/data.json" ,
"http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json" ,
"https://datosabiertos.desarrollosocial.gob.ar" ,
"http://datos.mindef.gov.ar/data.json" ,
] ;
// desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar
2023-11-27 20:01:56 +00:00
// FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
2023-11-28 02:10:24 +00:00
// Configure undici's global dispatcher with pipelining disabled: one
// request at a time per connection, to be gentle with fragile servers.
const globalAgent = new Agent({
  pipelining: 0,
});
setGlobalDispatcher(globalAgent);
2023-11-27 20:01:56 +00:00
2023-11-28 22:19:35 +00:00
/** Per-host concurrency limiters (key is the URL host).
 * @type {Map<string, import("p-limit").LimitFunction>} */
const limiters = new Map();

// Max parallel downloads per host; overridable via the N_THREADS env var.
// BUG FIX: parseInt now gets an explicit radix of 10.
const nThreads = process.env.N_THREADS
  ? parseInt(process.env.N_THREADS, 10)
  : 8;
2023-11-28 22:19:35 +00:00
2023-11-27 20:01:56 +00:00
/** Error thrown when a response arrives with a non-2xx HTTP status. */
class StatusCodeError extends Error {
  /**
   * @param {number} code HTTP status code of the failed response
   */
  constructor(code) {
    super(`Status code: ${code}`);
    // Keep the numeric status around so callers can branch on it.
    this.code = code;
  }
}
2023-11-28 02:19:09 +00:00
class TooManyRedirectsError extends Error { }
2023-11-28 23:46:14 +00:00
// Catalog URLs come from the command line; when none are given, fall
// back to the built-in portal list.
let jsonUrls = process.argv.slice(2);
if (jsonUrls.length < 1) {
  jsonUrls = sitiosPorDefecto;
}

// Kick off every catalog mirror concurrently; a failure in one catalog
// must not abort the others, so each rejection is logged and swallowed.
for (const url of jsonUrls) {
  downloadFromData(url).catch((error) => {
    console.error(`${url} FALLÓ CON`, error);
  });
}
2023-11-28 21:22:25 +00:00
/**
 * Mirrors one data.json catalog: fetches the catalog, saves it, then
 * downloads every distribution of every dataset into a directory named
 * after the catalog's host. Per-distribution failures are appended to
 * `<host>/errors.jsonl` instead of aborting the run.
 * @param {string} jsonUrlString URL of the DCAT data.json catalog
 */
async function downloadFromData(jsonUrlString) {
  const jsonUrl = new URL(jsonUrlString);
  const outputPath = jsonUrl.host;
  await mkdir(outputPath, { recursive: true });

  const errorFile = (
    await open(join(outputPath, "errors.jsonl"), "w")
  ).createWriteStream();

  try {
    const jsonRes = await fetch(jsonUrl);
    // prettier-ignore
    const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
    // Keep a copy of the catalog itself next to the downloaded files.
    await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));

    /** @type {DownloadJob[]} */
    const jobs = parsed.dataset.flatMap((dataset) =>
      dataset.distribution
        .filter((dist) => {
          // Drop distributions whose downloadURL is not even parseable,
          // recording the reason in errors.jsonl.
          try {
            patchUrl(new URL(dist.downloadURL));
            return true;
          } catch (error) {
            errorFile.write(
              JSON.stringify(encodeError({ dataset, dist }, error)) + "\n",
            );
            return false;
          }
        })
        .map((dist) => ({
          dataset,
          dist,
          url: patchUrl(new URL(dist.downloadURL)),
          outputPath,
          attempts: 0,
        })),
    );
    const totalJobs = jobs.length;
    let nFinished = 0;
    let nErrors = 0;

    // Sanity check: duplicated ids would make downloads overwrite each other.
    chequearIdsDuplicados(jobs);

    // Randomize job order so no single host gets hammered in a burst.
    shuffleArray(jobs);

    const promises = jobs.map((job) => {
      // One limiter per host caps concurrent downloads to that host.
      let limit = limiters.get(job.url.host);
      if (!limit) {
        limit = pLimit(nThreads);
        limiters.set(job.url.host, limit);
      }
      return limit(async () => {
        try {
          await downloadDistWithRetries(job);
        } catch (error) {
          // BUG FIX: was `JSON.stringify(job, encodeError(error))`, which
          // passed encodeError's result as a bogus replacer and dropped
          // the error details. encodeError takes (job, error).
          errorFile.write(JSON.stringify(encodeError(job, error)) + "\n");
          nErrors++;
        } finally {
          nFinished++;
        }
      });
    });

    // Progress heartbeat every 30 s so long runs show signs of life.
    process.stderr.write(`info[${jsonUrl.host}]: 0/${totalJobs} done\n`);
    const interval = setInterval(() => {
      process.stderr.write(
        `info[${jsonUrl.host}]: ${nFinished}/${totalJobs} done\n`,
      );
    }, 30000);
    await Promise.all(promises);
    clearInterval(interval);
    if (nErrors > 0)
      console.error(`${jsonUrl.host}: Finished with ${nErrors} errors`);
  } finally {
    errorFile.close();
  }
}
2023-11-28 22:19:35 +00:00
/**
 * Downloads one distribution, retrying transient failures.
 * @argument {DownloadJob} job
 * @argument {number} attempts retries already performed
 */
async function downloadDistWithRetries(job, attempts = 0) {
  const { url } = job;
  try {
    await downloadDist(job);
  } catch (error) {
    const isStatusError = error instanceof StatusCodeError;
    const isRedirectLoop = error instanceof TooManyRedirectsError;
    // Some servers use 403 as a "slow down" signal; retry up to 15
    // times, 15 seconds apart.
    const looksLikeThrottling =
      isStatusError &&
      error.code === 403 &&
      url.host === "minsegar-my.sharepoint.com";
    if (looksLikeThrottling && attempts < 15) {
      await wait(15000);
      return await downloadDistWithRetries(job, attempts + 1);
    }
    // For non-HTTP errors, retry up to 10 times, 5 seconds apart.
    if (!isStatusError && !isRedirectLoop && attempts < 10) {
      await wait(5000);
      return await downloadDistWithRetries(job, attempts + 1);
    }
    throw error;
  }
}
2023-11-28 01:43:58 +00:00
/**
 * Fetches a single distribution and writes it to
 * `<outputPath>/<dataset id>/<distribution id>/<file name>`.
 * @argument {DownloadJob} job
 */
async function downloadDist({ dist, dataset, url, outputPath }) {
  // SharePoint refuses to serve bots, so pretend to be a browser there.
  const pretendToBeABrowser = url.host.endsWith("sharepoint.com");
  const userAgent = pretendToBeABrowser
    ? "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
    : "transicion-desordenada (https://nulo.ar)";

  const res = await request(url.toString(), {
    maxRedirections: 20,
    headers: {
      "User-Agent": userAgent,
    },
  });
  // A 3xx surviving 20 redirects means we are stuck in a loop.
  if (res.statusCode >= 300 && res.statusCode <= 399)
    throw new TooManyRedirectsError();
  if (res.statusCode < 200 || res.statusCode > 299) {
    throw new StatusCodeError(res.statusCode);
  }

  // Identifiers come from the remote catalog — sanitize them so they
  // cannot escape the output directory.
  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier),
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    sanitizeSuffix(dist.fileName || dist.identifier),
  );
  if (!res.body) throw new Error("no body");
  await writeFile(filePath, res.body);
}
2023-11-28 01:43:58 +00:00
/** @typedef DownloadJob
 * @prop {Dataset} dataset
 * @prop {Distribution} dist
 * @prop {URL} url
 * @prop {string} outputPath
 * @prop {number} attempts
 * @prop {Date=} waitUntil
 */
/** @typedef Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
/** @typedef Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// Strips leading "../" (or "..\") segments so that identifiers taken
// from a remote catalog cannot escape the output directory.
// https://security.stackexchange.com/a/123723
/**
 * @argument {string} path
 */
function sanitizeSuffix(path) {
  const normalized = normalize(path);
  return normalized.replace(/^(\.\.(\/|\\|$))+/, "");
}
2023-11-28 21:22:25 +00:00
/**
 * Warns when two jobs share the same dataset/distribution id pair,
 * because those would end up writing into the same directory.
 * @param {DownloadJob[]} jobs
 */
function chequearIdsDuplicados(jobs) {
  const ids = jobs.map(
    (job) => `${job.dataset.identifier}/${job.dist.identifier}`,
  );
  if (hasDuplicates(ids)) {
    console.error(
      "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si",
    );
  }
}
2023-11-27 20:01:56 +00:00
// https://stackoverflow.com/a/7376645
/** @argument {any[]} array */
function hasDuplicates(array) {
  // Walk the array once, remembering what we have already seen.
  const seen = new Set();
  for (const value of array) {
    if (seen.has(value)) return true;
    seen.add(value);
  }
  return false;
}
2023-11-28 01:43:58 +00:00
/** Resolves after the given delay.
 * @argument {number} ms */
function wait(ms) {
  return new Promise((resolve) => {
    setTimeout(() => resolve(undefined), ms);
  });
}
2023-11-28 01:43:58 +00:00
2023-11-28 22:57:35 +00:00
/**
 * Serializes a failed job + error into a JSON-friendly record for
 * errors.jsonl.
 * @param {{ dataset: Dataset, dist: Distribution, url?: URL }} job
 * @param {any} error
 * @returns {object} plain object with identifiers and an error `kind`
 */
function encodeError(job, error) {
  const always = {
    // BUG FIX: was `job.url?.toString` (missing call parentheses) — that
    // yields the function itself, which JSON.stringify silently drops,
    // so the url never appeared in errors.jsonl.
    url: job.url?.toString() || job.dist.downloadURL,
    datasetIdentifier: job.dataset.identifier,
    distributionIdentifier: job.dist.identifier,
  };
  if (error instanceof StatusCodeError)
    return { ...always, kind: "http_error", status_code: error.code };
  else if (error instanceof TooManyRedirectsError)
    return { ...always, kind: "infinite_redirect" };
  else {
    return {
      ...always,
      kind: "generic_error",
      error: error.code || error.message,
    };
  }
}
2023-11-28 03:41:25 +00:00
/**
 * Patches URLs that break on their own.
 * @param {URL} url mutated in place
 * @returns {URL} the same URL object, possibly patched
 */
function patchUrl(url) {
  if (url.host !== "www.ign.gob.ar") return url;
  // By default 'http://www.ign.gob.ar' redirects to 'https://ign.gob.ar',
  // whose certificate only covers '*.ign.gob.ar'. The same content is
  // served correctly at 'https://www.ign.gob.ar', so go there instead.
  url.protocol = "https:";
  return url;
}
2023-11-28 21:22:25 +00:00
// Fisher-Yates shuffle, in place. https://stackoverflow.com/a/12646864
/** @param {any[]} array */
function shuffleArray(array) {
  for (let index = array.length - 1; index >= 1; index--) {
    const pick = Math.floor(Math.random() * (index + 1));
    const swapped = array[index];
    array[index] = array[pick];
    array[pick] = swapped;
  }
}