// 2023-11-27 20:01:56 +00:00
// @ts-check
import { mkdir , open } from "node:fs/promises" ;
import { Agent , fetch } from "undici" ;
import { join , normalize } from "node:path" ;
import { pipeline } from "node:stream/promises" ;
// FYI: at least the following domains do not serve the complete certificate chain over HTTPS.
// We have to use a hack (node_extra_ca_certs_mozilla_bundle) to connect to these sites
// (this can be checked with ssllabs.com). Hopefully the administrators of these servers fix it.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar

// Options for the single undici Agent that is passed as `dispatcher` to fetch().
const agentOptions = {
  pipelining: 10,
  maxRedirections: 20,
};
const dispatcher = new Agent(agentOptions);
/**
 * Error raised when a download answers with an HTTP error status.
 * The numeric status is exposed as `code` so callers can branch on it.
 */
class StatusCodeError extends Error {
  /**
   * @param {number} code HTTP status code returned by the server
   */
  constructor(code) {
    super(` Status code: ${code} `);
    // Give the subclass its own name so logs and stack traces show
    // "StatusCodeError" instead of the generic "Error".
    this.name = "StatusCodeError";
    this.code = code;
  }
}
// Output directory: first CLI argument. Every download is written beneath it.
const outputPath = process.argv[2];
if (!outputPath) {
  console.error("Especificamente el output porfa");
  process.exit(1);
}
await mkdir(outputPath, { recursive: true });

// Read the JSON document from stdin.
// The stream is switched to UTF-8 string mode before reading so that a
// multi-byte character split across two chunks is decoded correctly; joining
// raw Buffer chunks would decode each chunk on its own and corrupt it.
process.stdin.setEncoding("utf8");
const json = await process.stdin.toArray();
const jsonString = json.join("");
/** @type {{ dataset: Dataset[] }} */
const parsed = JSON.parse(jsonString);

// One job per (dataset, distribution) pair. A distribution whose downloadURL
// cannot be parsed is reported and skipped instead of aborting the whole run.
const jobs = parsed.dataset.flatMap((dataset) =>
  dataset.distribution.flatMap((dist) => {
    try {
      return [{ dataset, dist, url: new URL(dist.downloadURL) }];
    } catch (error) {
      console.error(
        `error: Invalid download URL ${dist.downloadURL} (${dataset.identifier}/${dist.identifier}), skipping:`,
        error
      );
      return [];
    }
  })
);
const totalJobs = jobs.length;
let nFinished = 0;

// Just in case, check that there are no duplicated files: two jobs with the
// same dataset/distribution identifiers would be written to the same place.
const duplicated = hasDuplicates(
  jobs.map((j) => ` ${j.dataset.identifier} / ${j.dist.identifier} `)
);
if (duplicated) {
  console.error(
    "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si"
  );
}
// Upper bound on attempts for a distribution whose server keeps answering
// with the "slow down" 403, so a permanently forbidden URL cannot stall a worker.
const maxAttempts = 5;

// Worker pool: 128 async workers drain the shared `jobs` queue until it is empty.
const greens = Array.from({ length: 128 }, async () => {
  let job;
  while ((job = jobs.pop())) {
    const { dataset, dist } = job;
    // A real loop is needed here: `continue` inside `do { } while (0)` jumps to
    // the (false) condition and leaves the loop, so it never retried anything.
    for (let attempt = 1; ; attempt++) {
      try {
        await downloadDist(dataset, dist);
        break;
      } catch (error) {
        // Some servers use 403 as a way of saying "calm down".
        const isRateLimited =
          error instanceof StatusCodeError &&
          error.code === 403 &&
          dist.downloadURL.includes("minsegar-my.sharepoint.com");
        if (isRateLimited && attempt < maxAttempts) {
          console.debug(
            ` debug: reintentando ${dist.downloadURL} porque tiró 403 `
          );
          await wait(15000);
          continue;
        }
        // Status errors are logged as a short string; anything else is logged
        // as the error object itself.
        console.error(
          ` error: Failed to download URL ${dist.downloadURL} ( ${dataset.identifier} / ${dist.identifier} ): `,
          error instanceof StatusCodeError ? error.toString() : error
        );
        break;
      }
    }
    // Counted once per job, no matter how many attempts it took.
    nFinished++;
  }
});

// Report progress every 15 seconds while the workers are running.
const interval = setInterval(() => {
  console.info(` info: ${nFinished} / ${totalJobs} done `);
}, 15000);
try {
  await Promise.all(greens);
} finally {
  // Always stop the timer, otherwise it would keep the process alive.
  clearInterval(interval);
}
/**
 * Downloads one distribution and writes it to
 * `<outputPath>/<dataset.identifier>/<dist.identifier>/<dist.fileName || dist.identifier>`
 * (each path component is passed through `sanitizeSuffix`).
 *
 * @argument {Dataset} dataset
 * @argument {Distribution} dist
 * @throws {StatusCodeError} when the server answers with a status >= 400
 * @throws {Error} when the response has no body, or when `dist.downloadURL`
 *   is not a valid URL (thrown by the `URL` constructor)
 */
async function downloadDist(dataset, dist) {
  const url = new URL(dist.downloadURL);
  const res = await fetch(url.toString(), {
    dispatcher,
  });
  if (res.status >= 400) {
    // The body of an error response is never read, so cancel it; undici asks
    // callers to always consume or cancel response bodies. Best-effort: a
    // failure to cancel must not hide the status error.
    await res.body?.cancel().catch(() => {});
    throw new StatusCodeError(res.status);
  }
  // Check for a body before touching the filesystem, so that a body-less
  // response neither leaves an empty file behind nor leaks an open handle.
  if (!res.body) throw new Error("no body");

  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier)
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    sanitizeSuffix(dist.fileName || dist.identifier)
  );
  const outputFile = await open(filePath, "w");
  await pipeline(res.body, outputFile.createWriteStream());
}
/** @typedef {object} Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
/** @typedef {object} Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// https://security.stackexchange.com/a/123723
/**
 * @argument {string} path
 */
function sanitizeSuffix(path) {
  // Normalize the path, then remove any run of `..` segments at its start
  // (each followed by `/`, `\` or the end of the string).
  const leadingParentSegments = /^(\.\.(\/|\\|$))+/;
  const collapsed = normalize(path);
  return collapsed.replace(leadingParentSegments, "");
}
// https://stackoverflow.com/a/7376645
/**
 * @argument {any[]} array
 */
function hasDuplicates(array) {
  // Remember every value already visited; seeing one a second time means the
  // array contains a duplicate (Set membership, so NaN matches NaN).
  const seen = new Set();
  for (const item of array) {
    if (seen.has(item)) {
      return true;
    }
    seen.add(item);
  }
  return false;
}
/**
 * @argument {number} ms
 */
function wait(ms) {
  // A negative delay resolves immediately; anything else resolves once a
  // timer of `ms` milliseconds fires.
  return ms < 0
    ? Promise.resolve()
    : new Promise((done) => {
        setTimeout(done, ms);
      });
}