2023-11-27 20:01:56 +00:00
// @ts-check
2023-11-28 02:10:24 +00:00
import { mkdir , open , writeFile } from "node:fs/promises" ;
import { Agent , fetch , request , setGlobalDispatcher } from "undici" ;
2023-11-27 20:01:56 +00:00
import { join , normalize } from "node:path" ;
import { pipeline } from "node:stream/promises" ;
// FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
2023-11-28 02:19:09 +00:00
// TODO: revisar por qué falla http://www.ign.gob.ar/descargas/geodatos/CSV/ign_municipio.csv
2023-11-28 02:10:24 +00:00
// Configure undici's global dispatcher once for the whole script.
// pipelining: 0 tells undici not to pipeline requests on a connection —
// presumably because some of the (flaky) government servers scraped here
// misbehave with pipelined requests. NOTE(review): confirm intent.
setGlobalDispatcher(new Agent({ pipelining: 0 }));
2023-11-27 20:01:56 +00:00
/**
 * Error thrown when a request finishes with a non-successful HTTP status.
 * Carries the numeric status code so callers can branch on it (e.g. the
 * 403-retry logic in downloadDistWithRetries).
 */
class StatusCodeError extends Error {
  /**
   * @param {number} code - the HTTP status code that was received
   */
  constructor(code) {
    // Fixed garbled message template (stray spaces around the interpolation).
    super(`Status code: ${code}`);
    this.name = "StatusCodeError";
    this.code = code;
  }
}
2023-11-28 02:19:09 +00:00
// Thrown when a response is still a 3xx after undici has already followed
// the maximum number of redirects (see downloadDist). Not retried.
class TooManyRedirectsError extends Error { }
2023-11-27 20:01:56 +00:00
2023-11-28 02:10:24 +00:00
// The first CLI argument is the URL of the data.json catalog to mirror.
const jsonUrlString = process.argv[2];
if (!jsonUrlString) {
  console.error("Especificamente el url al json porfa");
  process.exit(1);
}
const jsonUrl = new URL(jsonUrlString);

// Everything is downloaded into a directory named after the catalog's host.
const outputPath = jsonUrl.host;
await mkdir(outputPath, { recursive: true });

// Per-download failures are appended to this file as JSON lines.
const errorFile = await open(join(outputPath, "errors.jsonl"), "w");

// Fetch the catalog and keep a verbatim copy on disk next to the downloads.
const jsonRes = await fetch(jsonUrl);
// prettier-ignore
const parsed = /** @type {{ dataset: Dataset[] }} */ (await jsonRes.json())
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
2023-11-27 20:01:56 +00:00
// One download job per (dataset, distribution) pair in the catalog.
const jobs = parsed.dataset.flatMap((dataset) =>
  dataset.distribution.map((dist) => ({
    dataset,
    dist,
    url: new URL(dist.downloadURL),
  }))
);
const totalJobs = jobs.length;
let nFinished = 0;
let nErrors = 0;

// just in case, warn if two jobs would write to the same file
chequearIdsDuplicados();

// Group jobs by host so each host gets its own bounded worker pool below.
/** @type {Map<string, DownloadJob[]>} */
const jobsPerHost = new Map();
for (const job of jobs) {
  // Push onto the existing queue instead of re-spreading the whole array on
  // every iteration, which was accidentally O(n²) per host.
  const queue = jobsPerHost.get(job.url.host);
  if (queue) queue.push(job);
  else jobsPerHost.set(job.url.host, [job]);
}
2023-11-28 01:43:58 +00:00
// Spawn a fixed-size pool of concurrent workers per host; each worker keeps
// pulling jobs off its host's shared queue until the queue is drained.
const greens = [...jobsPerHost.entries()].flatMap(([, queue]) => {
  const nThreads = 8;

  const worker = async () => {
    for (let job = queue.pop(); job; job = queue.pop()) {
      try {
        await downloadDistWithRetries(job);
      } catch (error) {
        // Record the failure as one JSON line and keep going with the rest.
        await errorFile.write(
          JSON.stringify({
            url: job.url.toString(),
            ...encodeError(error),
          }) + "\n"
        );
        nErrors++;
      } finally {
        nFinished++;
      }
    }
  };

  return Array.from({ length: nThreads }, () => worker());
});
process.stderr.write(`greens: ${greens.length}\n`);

// Progress heartbeat on stderr every 30 seconds while downloads run.
const interval = setInterval(() => {
  process.stderr.write(`info: ${nFinished}/${totalJobs} done\n`);
}, 30000);
await Promise.all(greens);
clearInterval(interval);
if (nErrors > 0) console.error(`Finished with ${nErrors} errors`);
2023-11-27 20:01:56 +00:00
/**
 * Downloads a distribution, retrying transient failures.
 *
 * @argument {DownloadJob} job
 * @argument {number} tries - number of attempts that have already failed
 */
async function downloadDistWithRetries(job, tries = 0) {
  const { url } = job;
  try {
    await downloadDist(job);
  } catch (error) {
    const isHttpError = error instanceof StatusCodeError;
    // some servers use 403 as a way of saying "slow down":
    // retry up to 15 times with 15 seconds in between
    if (
      isHttpError &&
      error.code === 403 &&
      url.host === "minsegar-my.sharepoint.com" &&
      tries < 15
    ) {
      await wait(15000);
      return await downloadDistWithRetries(job, tries + 1);
    }
    // if it wasn't an HTTP error (and not a redirect loop),
    // retry up to 5 times with 5 seconds in between
    if (
      !isHttpError &&
      !(error instanceof TooManyRedirectsError) &&
      tries < 5
    ) {
      await wait(5000);
      return await downloadDistWithRetries(job, tries + 1);
    }
    throw error;
  }
}
/**
 * Downloads a single distribution's file to
 * `<outputPath>/<dataset id>/<distribution id>/<file name>`.
 *
 * @argument {DownloadJob} job
 * @throws {TooManyRedirectsError} if the response is still a redirect after
 *   undici followed maxRedirections hops
 * @throws {StatusCodeError} on any other non-2xx status
 */
async function downloadDist({ dist, dataset }) {
  const url = new URL(dist.downloadURL);
  const res = await request(url.toString(), {
    maxRedirections: 20,
  });

  if (res.statusCode >= 300 && res.statusCode <= 399) {
    // Drain the unused body so undici can release/reuse the connection.
    await res.body.dump();
    throw new TooManyRedirectsError();
  }
  if (res.statusCode < 200 || res.statusCode > 299) {
    await res.body.dump(); // same: don't leak the connection on error statuses
    throw new StatusCodeError(res.statusCode);
  }
  if (!res.body) throw new Error("no body");

  // Sanitize identifiers so a malicious "../" in the catalog can't escape
  // the output directory.
  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier)
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    sanitizeSuffix(dist.fileName || dist.identifier)
  );

  // Open the destination only after all early-exit checks, so failures above
  // can't leak a file handle.
  const outputFile = await open(filePath, "w");
  try {
    await pipeline(res.body, outputFile.createWriteStream());
  } finally {
    // createWriteStream() autoCloses the handle on finish/destroy; close
    // defensively anyway and ignore "already closed" rejections.
    await outputFile.close().catch(() => {});
  }
}
2023-11-28 01:43:58 +00:00
/** @typedef DownloadJob
 * @prop {Dataset} dataset
 * @prop {Distribution} dist
 * @prop {URL} url
 */
/** @typedef Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
/** @typedef Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// https://security.stackexchange.com/a/123723
/**
 * Strips any leading "../" (or "..\") segments after normalization, so the
 * result can be safely joined under a base directory.
 *
 * @argument {string} path
 */
function sanitizeSuffix(path) {
  const normalized = normalize(path);
  return normalized.replace(/^(\.\.(\/|\\|$))+/, "");
}
2023-11-28 01:43:58 +00:00
/**
 * Warns when two jobs share the same dataset id + distribution id, since
 * their downloads would end up writing to the same path.
 */
function chequearIdsDuplicados() {
  const keys = jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`);
  if (hasDuplicates(keys)) {
    console.error(
      "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si"
    );
  }
}
2023-11-27 20:01:56 +00:00
// https://stackoverflow.com/a/7376645
/**
 * Whether the array contains at least one repeated value
 * (compared with Set semantics, i.e. SameValueZero).
 *
 * @argument {any[]} array
 */
function hasDuplicates(array) {
  const unique = new Set(array);
  return unique.size !== array.length;
}
2023-11-28 01:43:58 +00:00
/**
 * Resolves after roughly `ms` milliseconds; negative values resolve
 * immediately.
 *
 * @argument {number} ms
 */
function wait(ms) {
  return ms < 0
    ? Promise.resolve()
    : new Promise((resolve) => setTimeout(resolve, ms));
}
2023-11-28 01:43:58 +00:00
/**
 * Serializes a thrown value into a small JSON-friendly record for
 * errors.jsonl.
 *
 * @argument {any} error - whatever a download job threw
 * @returns {{ kind: string }} a record tagged with an error kind
 */
function encodeError(error) {
  if (error instanceof StatusCodeError)
    // Fixed garbled key: "status _code" was split and didn't even parse.
    return { kind: "http_error", status_code: error.code };
  if (error instanceof TooManyRedirectsError)
    return { kind: "infinite_redirect" };
  // System/network errors (e.g. ECONNRESET) carry a .code string; fall back
  // to the message for everything else.
  return { kind: "generic_error", error: error.code || error.message };
}