// 2023-11-27 20:01:56 +00:00
// @ts-check
import { mkdir , open } from "node:fs/promises" ;
import { Agent , fetch } from "undici" ;
import { join , normalize } from "node:path" ;
import { pipeline } from "node:stream/promises" ;
// List of domains that are allowed to be fetched over plain http: because their HTTPS is broken.
// downloadDist() forces https: for every host that is NOT in this list.
const brokenHttps = [
"datos.mindef.gov.ar" , // certificate issued for a different domain
"datos.energia.gob.ar" , // certificate issued for a different domain
"datos.minem.gob.ar" , // expired 2022-17-06 (date kept as originally noted)
"datos.agroindustria.gob.ar" , // expired 2022-03-10
"andino.siu.edu.ar" , // self-signed; everything on this host seems to return 404 anyway
"datos.salud.gob.ar" , // times out over HTTPS
"datos.jus.gob.ar" , // HTTPS redirects incorrectly to nonexistent URLs
"www.hidro.gob.ar" , // no HTTPS
] ;
// FYI: at least the following domains do not serve the complete certificate
// chain over HTTPS, so a hack (node_extra_ca_certs_mozilla_bundle) is needed to
// connect to them (this can be checked with ssllabs.com). Hopefully the
// administrators of these servers will fix it.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
// Shared undici Agent; downloadDist() passes it to fetch() as the `dispatcher` option.
const dispatcher = new Agent ( {
pipelining : 10 , // NOTE(review): presumably the per-connection pipelining factor — confirm against undici docs
maxRedirections : 20 , // NOTE(review): presumably the maximum number of redirects followed — confirm against undici docs
} ) ;
/**
 * Error that carries the HTTP status code of a failed response.
 * Callers can tell it apart with `instanceof` and read the code from `.code`.
 */
class StatusCodeError extends Error {
  /**
   * @param {number} code HTTP status code reported by the server
   */
  constructor(code) {
    super(` Status code: ${code} `);
    // Without an explicit name, logs and toString() would show a generic "Error".
    this.name = "StatusCodeError";
    this.code = code;
  }
}
// Directory where the downloads are stored; taken from the first CLI argument.
const outputPath = process.argv[2];
if (!outputPath) {
  console.error("Especificamente el output porfa");
  process.exit(1);
}
await mkdir(outputPath, { recursive: true });

// Read the JSON document from stdin.
// The stream is switched to UTF-8 string mode first: joining raw Buffer chunks
// would decode every chunk on its own and corrupt any multi-byte character that
// happens to be split across two chunks.
process.stdin.setEncoding("utf8");
const json = await process.stdin.toArray();
const jsonString = json.join("");
/** @type {{ dataset: Dataset[] }} */
const parsed = JSON.parse(jsonString);

// One job per (dataset, distribution) pair.
const jobs = parsed.dataset.flatMap((dataset) =>
  dataset.distribution.map((dist) => ({ dataset, dist })),
);
// Crude way of spreading the load across servers: randomize the job order.
shuffleArray(jobs);
const totalJobs = jobs.length;
// Number of jobs that have been processed so far (successfully or not).
let nFinished = 0;

// Two jobs with the same dataset/distribution identifiers would be written to
// the same directory, so warn about it up front.
const duplicated = hasDuplicates(
  jobs.map((j) => ` ${j.dataset.identifier} / ${j.dist.identifier} `),
);
if (duplicated) {
  console.error(
    "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si",
  );
}
// Upper bound on attempts for a distribution whose server keeps answering with
// a throttling 403, so that a permanently forbidden URL cannot stall a worker.
const maxThrottledAttempts = 10;

// Pool of 128 concurrent workers. Each worker keeps taking jobs from the shared
// `jobs` array until it is empty; failures are logged and the worker moves on.
const greens = Array(128)
  .fill(0)
  .map(() =>
    (async () => {
      let job;
      while ((job = jobs.pop())) {
        const { dataset, dist } = job;
        try {
          // A real loop is required for retrying: `continue` inside a
          // `do { ... } while (0)` re-evaluates the (false) condition and
          // leaves the loop, so it never runs the body again.
          for (let attempt = 1; ; attempt++) {
            try {
              await downloadDist(dataset, dist);
              break;
            } catch (error) {
              // Some servers use 403 as a way of saying "slow down".
              const throttled =
                error instanceof StatusCodeError &&
                error.code === 403 &&
                dist.downloadURL.includes("minsegar-my.sharepoint.com");
              if (throttled && attempt < maxThrottledAttempts) {
                console.debug(
                  ` debug: reintentando ${dist.downloadURL} porque tiró 403 `,
                );
                await wait(15000);
                continue;
              }
              console.error(
                ` error: Failed to download URL ${dist.downloadURL} ( ${dataset.identifier} / ${dist.identifier} ): `,
                // Status errors are logged as a one-line string; anything else
                // is logged as the full error object.
                error instanceof StatusCodeError ? error.toString() : error,
              );
              break;
            }
          }
        } finally {
          // Count each job exactly once, however many attempts it took.
          nFinished++;
        }
      }
    })(),
  );
// Print a progress line every 15 seconds while the workers are running, and
// stop the timer once `greens` has resolved.
const logProgress = () => {
  console.info(` info: ${nFinished} / ${totalJobs} done `);
};
const interval = setInterval(logProgress, 15000);
await Promise.all(greens);
clearInterval(interval);
/**
 * Downloads one distribution and writes it to
 * `<outputPath>/<dataset.identifier>/<dist.identifier>/<fileName or identifier>`.
 *
 * @param {Dataset} dataset dataset the distribution belongs to
 * @param {Distribution} dist distribution to download
 * @throws {StatusCodeError} when the server answers with a status code >= 400
 * @throws {Error} when the response carries no body
 */
async function downloadDist(dataset, dist) {
  const url = new URL(dist.downloadURL);
  // Always use HTTPS, except for hosts whose HTTPS is known to be broken.
  url.protocol = brokenHttps.includes(url.host) ? "http:" : "https:";

  const res = await fetch(url.toString(), {
    dispatcher,
  });
  if (res.status >= 400) {
    // Best effort: discard the unread error body before bailing out, as undici
    // recommends always consuming or cancelling response bodies. A failure
    // here must not hide the status code error below.
    await res.body?.cancel().catch(() => {});
    throw new StatusCodeError(res.status);
  }
  // Checked before touching the filesystem, so a body-less response neither
  // leaks an open file handle nor leaves an empty file behind.
  if (!res.body) throw new Error("no body");

  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier),
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    // Fall back to the identifier when the distribution has no file name.
    sanitizeSuffix(dist.fileName || dist.identifier),
  );
  const outputFile = await open(filePath, "w");
  await pipeline(res.body, outputFile.createWriteStream());
}
/** @typedef {object} Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
/** @typedef {object} Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// https://security.stackexchange.com/a/123723
/**
 * @argument {string} path
 */
function sanitizeSuffix(path) {
  // Normalizing first collapses inner "." and ".." segments, so any remaining
  // parent-directory references end up at the very start of the string.
  const normalized = normalize(path);
  // Matches a leading run of ".." segments (with "/" or "\" separators).
  const leadingParentRefs = /^(\.\.(\/|\\|$))+/;
  return normalized.replace(leadingParentRefs, "");
}
// https://stackoverflow.com/a/7376645
/**
 * @argument {any[]} array
 */
function hasDuplicates(array) {
  // Track the values seen so far; a repeated value means there is a duplicate.
  const seen = new Set();
  for (const item of array) {
    if (seen.has(item)) {
      return true;
    }
    seen.add(item);
  }
  return false;
}
// https://stackoverflow.com/a/12646864
/**
 * @argument {any[]} array
 */
function shuffleArray(array) {
  // Shuffles `array` in place and returns nothing. Walks from the last index
  // down to 1, swapping each element with one at a random index in [0, i].
  for (let i = array.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [array[i], array[j]] = [array[j], array[i]];
  }
}
/**
 * @argument {number} ms
 */
function wait(ms) {
  // Negative delays resolve right away without scheduling a timer; anything
  // else resolves once the timer fires.
  return ms < 0
    ? Promise.resolve()
    : new Promise((done) => {
        setTimeout(done, ms);
      });
}