// @ts-check
import { mkdir , open } from "node:fs/promises" ;
import { Agent , fetch } from "undici" ;
import { join , normalize } from "node:path" ;
import { pipeline } from "node:stream/promises" ;
// FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
// Shared undici dispatcher for every request: aggressive pipelining to reuse
// connections, and follow long redirect chains (some catalogs bounce several times).
const dispatcher = new Agent({
  pipelining: 50,
  maxRedirections: 20,
});
/**
 * Error thrown when an HTTP response has a non-OK status code.
 */
class StatusCodeError extends Error {
  /**
   * @param {number} code - HTTP status code of the failed response.
   */
  constructor(code) {
    super(`Status code: ${code}`);
    this.name = "StatusCodeError";
    this.code = code;
  }
}
// Output directory comes from the first CLI argument.
const outputPath = process.argv[2];
if (!outputPath) {
  console.error("Especificamente el output porfa");
  process.exit(1);
}
await mkdir(outputPath, { recursive: true });

// Every failed download gets appended here as one JSON object per line.
const errorFile = await open(join(outputPath, "errors.jsonl"), "w");
// Read the whole catalog JSON from stdin.
const json = await process.stdin.toArray();
const jsonString = json.join("");
/** @type {{ dataset: Dataset[] }} */
const parsed = JSON.parse(jsonString);

// One download job per distribution of every dataset.
const jobs = parsed.dataset.flatMap((dataset) =>
  dataset.distribution.map((dist) => ({
    dataset,
    dist,
    url: new URL(dist.downloadURL),
  }))
);
const totalJobs = jobs.length;
let nFinished = 0;
let nErrors = 0;

// Warn up front if two jobs would collide on the same dataset/distribution ids
// (they would overwrite each other's files).
chequearIdsDuplicados();

// Group jobs by host so each host gets its own worker pool.
// Push onto the existing array instead of re-spreading it per job
// (the old copy-per-insert was accidentally O(n²)).
/** @type {Map<string, DownloadJob[]>} */
let jobsPerHost = new Map();
for (const job of jobs) {
  const hostJobs = jobsPerHost.get(job.url.host);
  if (hostJobs) hostJobs.push(job);
  else jobsPerHost.set(job.url.host, [job]);
}
// Spawn up to 128 concurrent workers per host; each worker pops jobs from that
// host's queue until it is empty. A failed job is recorded in errors.jsonl and
// counted, but never aborts the run.
const greens = [...jobsPerHost.entries()].flatMap(([, hostJobs]) => {
  const nThreads = 128;
  return Array(nThreads)
    .fill(0)
    .map(() =>
      (async () => {
        let job;
        while ((job = hostJobs.pop())) {
          try {
            await downloadDistWithRetries(job);
          } catch (error) {
            // errors.jsonl is JSON Lines: each record must end with a newline
            // (the original concatenated objects with no separator).
            await errorFile.write(
              JSON.stringify({ url: job.url.toString(), ...encodeError(error) }) +
                "\n"
            );
            nErrors++;
          } finally {
            nFinished++;
          }
        }
      })()
    );
});
process.stderr.write(`greens: ${greens.length}\n`);
// Print progress every 30 seconds until all workers finish.
const interval = setInterval(() => {
  process.stderr.write(`info: ${nFinished}/${totalJobs} done\n`);
}, 30000);
await Promise.all(greens);
clearInterval(interval);
if (nErrors > 0) console.error(`Finished with ${nErrors} errors`);
/**
 * Downloads one distribution, retrying on transient failures.
 *
 * @argument {DownloadJob} job
 * @argument {number} tries - recursion-internal retry counter.
 */
async function downloadDistWithRetries(job, tries = 0) {
  const { url } = job;
  try {
    await downloadDist(job);
  } catch (error) {
    // Some servers use 403 as a way of saying "slow down".
    // Retry up to 15 times with 15 seconds in between.
    if (
      error instanceof StatusCodeError &&
      error.code === 403 &&
      url.host === "minsegar-my.sharepoint.com" &&
      tries < 15
    ) {
      await wait(15000);
      return await downloadDistWithRetries(job, tries + 1);
    }
    // If it wasn't an HTTP error (and not an infinite redirect),
    // retry up to 5 times with 5 seconds in between.
    else if (
      !(error instanceof StatusCodeError) &&
      !errorIsInfiniteRedirect(error) &&
      tries < 5
    ) {
      await wait(5000);
      return await downloadDistWithRetries(job, tries + 1);
    } else throw error;
  }
}
/**
 * Downloads a single distribution to
 * `<outputPath>/<dataset id>/<distribution id>/<file name>`.
 *
 * @argument {DownloadJob} job
 * @throws {StatusCodeError} when the response status is not OK.
 */
async function downloadDist({ dist, dataset }) {
  const url = new URL(dist.downloadURL);
  const res = await fetch(url.toString(), {
    dispatcher,
  });
  if (!res.ok) {
    // Cancel the unread body so undici can release the connection to the pool.
    await res.body?.cancel();
    throw new StatusCodeError(res.status);
  }
  // Check for a body BEFORE creating directories/files, so a bodyless
  // response doesn't leave behind an empty file and an open handle.
  if (!res.body) throw new Error("no body");

  // Identifiers come from an external catalog: sanitize them so they
  // cannot escape outputPath.
  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier)
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    sanitizeSuffix(dist.fileName || dist.identifier)
  );
  const outputFile = await open(filePath, "w");
  // createWriteStream() autoCloses the handle on both finish and error.
  await pipeline(res.body, outputFile.createWriteStream());
}
/** @typedef DownloadJob
 * @prop {Dataset} dataset
 * @prop {Distribution} dist
 * @prop {URL} url
 */
/** @typedef Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
/** @typedef Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// https://security.stackexchange.com/a/123723
/**
 * Normalizes a path fragment and strips any leading "../" segments so it can
 * be safely joined under the output directory.
 * @argument {string} path
 */
function sanitizeSuffix(path) {
  return normalize(path).replace(/^(\.\.(\/|\\|$))+/, "");
}
/**
 * Warns when two jobs share the same dataset/distribution identifier pair,
 * because those jobs would write to the same output path.
 */
function chequearIdsDuplicados() {
  const ids = jobs.map(
    ({ dataset, dist }) => `${dataset.identifier}/${dist.identifier}`
  );
  if (hasDuplicates(ids)) {
    console.error(
      "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si"
    );
  }
}
// https://stackoverflow.com/a/7376645
/**
 * True when the array contains at least one repeated value.
 * @argument {any[]} array
 */
function hasDuplicates(array) {
  const unique = new Set(array);
  return unique.size !== array.length;
}
/**
 * Resolves after roughly `ms` milliseconds (immediately for negative values).
 * @argument {number} ms
 */
function wait(ms) {
  return ms < 0
    ? Promise.resolve()
    : new Promise((done) => setTimeout(done, ms));
}
/**
 * Converts an error into a JSON-serializable record for errors.jsonl.
 * @argument {unknown} error
 */
function encodeError(error) {
  if (error instanceof StatusCodeError)
    return { kind: "http_error", status_code: error.code };
  else if (errorIsInfiniteRedirect(error)) return { kind: "infinite_redirect" };
  else {
    // `cause` may be absent: optional chaining keeps the error reporter
    // itself from throwing a TypeError.
    console.error(error, error?.cause?.message);
    return { kind: "generic_error", error };
  }
}
/**
 * True when undici aborted because the redirect limit was exceeded.
 * @argument {any} error
 */
function errorIsInfiniteRedirect(error) {
  const causeMessage = error?.cause?.message;
  return causeMessage === "redirect count exceeded";
}