2023-11-27 20:01:56 +00:00
// @ts-check
2023-11-28 02:10:24 +00:00
import { mkdir , open , writeFile } from "node:fs/promises" ;
import { Agent , fetch , request , setGlobalDispatcher } from "undici" ;
2023-11-27 20:01:56 +00:00
import { join , normalize } from "node:path" ;
2023-11-28 22:19:35 +00:00
import pLimit from "p-limit" ;
2023-11-27 20:01:56 +00:00
// FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
2023-11-28 02:10:24 +00:00
// Install a single undici Agent as the dispatcher used by every request/fetch
// issued from this script. Per the undici documentation, `pipelining: 0`
// disables keep-alive connections.
const dispatcher = new Agent({ pipelining: 0 });
setGlobalDispatcher(dispatcher);
2023-11-27 20:01:56 +00:00
2023-11-28 22:19:35 +00:00
/**
 * Concurrency limiters, one per remote host; the map key is the URL host.
 * @type {Map<string, import("p-limit").LimitFunction>}
 */
const limiters = new Map();

/**
 * Parses the value of the N_THREADS environment variable.
 *
 * @param {string | undefined} raw Raw value of the variable.
 * @param {number} [fallback] Value used when `raw` is unset, empty or invalid.
 * @returns {number} A positive integer.
 */
function parseThreadCount(raw, fallback = 16) {
  if (!raw) return fallback;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isInteger(parsed) && parsed >= 1) return parsed;
  // A NaN or non-positive value would otherwise reach pLimit() and fail there
  // with a much less helpful message.
  console.error(
    `Ignoring invalid N_THREADS value ${JSON.stringify(raw)}; using ${fallback}`
  );
  return fallback;
}

/** Concurrency passed to every per-host limiter (N_THREADS, default 16). */
const nThreads = parseThreadCount(process.env.N_THREADS);
2023-11-27 20:01:56 +00:00
/** Error carrying the HTTP status code of a response that was not successful. */
class StatusCodeError extends Error {
  /**
   * @param {number} code HTTP status code received from the server.
   */
  constructor(code) {
    super(`Status code: ${code}`);
    // Make logs and stack traces identify the error type.
    this.name = "StatusCodeError";
    /** @type {number} HTTP status code, read by the retry and error-encoding logic. */
    this.code = code;
  }
}
2023-11-28 02:19:09 +00:00
/** Error thrown when a download still ends on a 3xx response after following redirects. */
class TooManyRedirectsError extends Error {
  /**
   * @param {string} [message] Optional custom message.
   */
  constructor(message = "Too many redirects") {
    super(message);
    // Make logs and stack traces identify the error type.
    this.name = "TooManyRedirectsError";
  }
}
2023-11-27 20:01:56 +00:00
2023-11-28 02:10:24 +00:00
// Entry point: the first command-line argument is the URL of the JSON document
// that lists the datasets to download.
const jsonUrlString = process.argv[2];
if (!jsonUrlString) {
  console.error("Especificamente el url al json porfa");
  process.exit(1);
}
// Handle the rejection explicitly instead of leaving a floating promise:
// report the failure and exit with a non-zero status.
downloadFromData(jsonUrlString).catch((error) => {
  console.error(error);
  process.exit(1);
});
/**
 * Fetches the JSON document at `jsonUrlString` and downloads every
 * distribution of every dataset it lists.
 *
 * Everything is written under a directory named after the URL's host:
 * - `data.json`: a copy of the fetched document,
 * - `errors.jsonl`: one JSON line per download that ultimately failed,
 * - the downloaded files themselves (see `downloadDist`).
 *
 * Downloads are limited to `nThreads` concurrent jobs per host, and progress is
 * written to stderr every 30 seconds.
 *
 * @param {string} jsonUrlString
 * @throws {StatusCodeError} when the JSON document itself is not served with a 2xx status.
 */
async function downloadFromData(jsonUrlString) {
  const jsonUrl = new URL(jsonUrlString);
  const outputPath = jsonUrl.host;
  await mkdir(outputPath, { recursive: true });
  const errorFile = await open(join(outputPath, "errors.jsonl"), "w");
  /** @type {ReturnType<typeof setInterval> | undefined} */
  let interval;
  try {
    const jsonRes = await fetch(jsonUrl);
    // Fail with a clear error instead of trying to parse an error page as JSON.
    if (!jsonRes.ok) throw new StatusCodeError(jsonRes.status);
    // prettier-ignore
    const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
    await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));

    /** @type {DownloadJob[]} */
    const jobs = parsed.dataset.flatMap((dataset) =>
      dataset.distribution.map((dist) => ({
        dataset,
        dist,
        url: patchUrl(new URL(dist.downloadURL)),
        outputPath,
        attempts: 0,
      }))
    );
    const totalJobs = jobs.length;
    let nFinished = 0;
    let nErrors = 0;

    // Just in case, check that no two jobs would write to the same path.
    chequearIdsDuplicados(jobs);
    // NOTE(review): presumably shuffled so jobs for one host are not queued
    // back-to-back; confirm the intent.
    shuffleArray(jobs);

    const promises = jobs.map((job) => {
      // Lazily create one limiter per host.
      let limit = limiters.get(job.url.host);
      if (!limit) {
        limit = pLimit(nThreads);
        limiters.set(job.url.host, limit);
      }
      return limit(async () => {
        try {
          await downloadDistWithRetries(job);
        } catch (error) {
          await errorFile.write(
            JSON.stringify({
              url: job.url.toString(),
              ...encodeError(error),
            }) + "\n"
          );
          nErrors++;
        } finally {
          nFinished++;
        }
      });
    });

    process.stderr.write(`info: 0/${totalJobs} done\n`);
    interval = setInterval(() => {
      process.stderr.write(`info: ${nFinished}/${totalJobs} done\n`);
    }, 30000);
    await Promise.all(promises);
    if (nErrors > 0) console.error(`Finished with ${nErrors} errors`);
  } finally {
    // Always stop the progress timer and release the file handle, even when
    // something above throws; otherwise the timer keeps the process alive and
    // the handle leaks.
    clearInterval(interval);
    await errorFile.close();
  }
}
2023-11-28 22:19:35 +00:00
/**
 * Runs `downloadDist(job)`, retrying on failures that look transient.
 *
 * - A 403 from `minsegar-my.sharepoint.com` is retried after 15 seconds while
 *   the attempt counter is below 15 (some servers use 403 to say "slow down").
 * - Any error that is neither a `StatusCodeError` nor a
 *   `TooManyRedirectsError` is retried after 5 seconds while the attempt
 *   counter is below 5.
 * - Every other error is rethrown unchanged.
 *
 * Both rules share the same attempt counter.
 *
 * @argument {DownloadJob} job
 * @argument {number} attempts Retries already performed; callers normally omit it.
 */
async function downloadDistWithRetries(job, attempts = 0) {
  const RATE_LIMIT_HOST = "minsegar-my.sharepoint.com";
  const RATE_LIMIT_MAX_RETRIES = 15;
  const RATE_LIMIT_DELAY_MS = 15000;
  const GENERIC_MAX_RETRIES = 5;
  const GENERIC_DELAY_MS = 5000;

  const { url } = job;
  for (let attempt = attempts; ; attempt++) {
    try {
      await downloadDist(job);
      return;
    } catch (error) {
      const isHttpError = error instanceof StatusCodeError;
      const isRateLimited =
        isHttpError && error.code === 403 && url.host === RATE_LIMIT_HOST;
      if (isRateLimited && attempt < RATE_LIMIT_MAX_RETRIES) {
        await wait(RATE_LIMIT_DELAY_MS);
        continue;
      }
      // Not an HTTP-level failure (e.g. a network error): retry a few times.
      const isGenericError =
        !isHttpError && !(error instanceof TooManyRedirectsError);
      if (isGenericError && attempt < GENERIC_MAX_RETRIES) {
        await wait(GENERIC_DELAY_MS);
        continue;
      }
      throw error;
    }
  }
}
2023-11-28 01:43:58 +00:00
/**
 * Downloads one distribution and writes it to
 * `<outputPath>/<dataset.identifier>/<dist.identifier>/<dist.fileName || dist.identifier>`
 * (each path component passes through `sanitizeSuffix`).
 *
 * @argument {DownloadJob} job
 * @throws {TooManyRedirectsError} when the final response is still a 3xx after
 *   following up to 20 redirects.
 * @throws {StatusCodeError} for any other status outside 200-299.
 */
async function downloadDist({ dist, dataset, url, outputPath }) {
  // SharePoint does not like serving bots, so present a browser User-Agent there.
  const spoofUserAgent = url.host.endsWith("sharepoint.com");
  const res = await request(url.toString(), {
    maxRedirections: 20,
    headers: {
      "User-Agent": spoofUserAgent
        ? "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
        : "transicion-desordenada (https://nulo.ar)",
    },
  });

  const { statusCode } = res;
  if (statusCode < 200 || statusCode > 299) {
    // undici expects every response body to be consumed or discarded so the
    // connection can be released. Draining is best-effort: the status error
    // below is the failure we want to report.
    await res.body?.dump().catch(() => {});
    if (statusCode >= 300 && statusCode <= 399) {
      throw new TooManyRedirectsError();
    }
    throw new StatusCodeError(statusCode);
  }

  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
    sanitizeSuffix(dist.identifier)
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
    sanitizeSuffix(dist.fileName || dist.identifier)
  );

  if (!res.body) throw new Error("no body");
  await writeFile(filePath, res.body);
}
2023-11-28 01:43:58 +00:00
/** @typedef DownloadJob
 * @prop {Dataset} dataset
 * @prop {Distribution} dist
 * @prop {URL} url
 * @prop {string} outputPath
 * @prop {number} attempts
 * @prop {Date=} waitUntil
 */
/** @typedef Dataset
 * @prop {string} identifier
 * @prop {Distribution[]} distribution
 */
2023-11-28 01:43:58 +00:00
/** @typedef Distribution
 * @prop {string} identifier
 * @prop {string} fileName
 * @prop {string} downloadURL
 */
// https://security.stackexchange.com/a/123723
/**
 * Normalizes `path` and removes any leading run of `..` segments, so the
 * result cannot climb out of the directory it is later joined onto.
 *
 * @argument {string} path
 * @returns {string} The normalized path without leading `..` segments.
 */
function sanitizeSuffix(path) {
  return normalize(path).replace(/^(\.\.(\/|\\|$))+/, "");
}
2023-11-28 21:22:25 +00:00
/**
 * Warns on stderr when two jobs share the same
 * `<dataset identifier>/<distribution identifier>` pair, because their
 * downloads may then overwrite each other.
 *
 * @param {DownloadJob[]} jobs
 */
function chequearIdsDuplicados(jobs) {
  const duplicated = hasDuplicates(
    jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`)
  );
  if (duplicated) {
    console.error(
      "ADVERTENCIA: ¡encontré duplicados! es posible que se pisen archivos entre si"
    );
  }
}
2023-11-27 20:01:56 +00:00
// https://stackoverflow.com/a/7376645
/**
 * Tells whether `array` contains the same value more than once.
 *
 * @argument {any[]} array
 * @returns {boolean} `true` when at least one value is repeated.
 */
function hasDuplicates(array) {
  // A Set keeps one copy of each value, so it is smaller than the array
  // exactly when something is repeated.
  return new Set(array).size !== array.length;
}
2023-11-28 01:43:58 +00:00
/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @argument {number} ms
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
2023-11-28 01:43:58 +00:00
/**
 * Turns an error caught while downloading into the plain fields that are
 * merged into its `errors.jsonl` entry.
 *
 * @param {unknown} error The caught value.
 * @returns {{ kind: string, status_code?: number, error?: unknown }}
 *   `http_error` with the status code, `infinite_redirect`, or
 *   `generic_error` with the error's `code` (when truthy) or its `message`.
 */
function encodeError(error) {
  if (error instanceof StatusCodeError) {
    return { kind: "http_error", status_code: error.code };
  }
  if (error instanceof TooManyRedirectsError) {
    return { kind: "infinite_redirect" };
  }
  // JavaScript allows throwing any value; reading `.code` on null or undefined
  // would itself throw inside the caller's catch block.
  if (error === null || typeof error !== "object") {
    return { kind: "generic_error", error: String(error) };
  }
  const { code, message } = /** @type {{ code?: unknown, message?: unknown }} */ (error);
  return { kind: "generic_error", error: code || message };
}
2023-11-28 03:41:25 +00:00
/**
 * Patches URLs that are known to break on their own. The given URL object is
 * modified in place and returned.
 *
 * @param {URL} url
 * @returns {URL} The same `url` object.
 */
function patchUrl(url) {
  if (url.host === "www.ign.gob.ar") {
    // By default 'http://www.ign.gob.ar' redirects to 'https://ign.gob.ar',
    // but its certificate only covers '*.ign.gob.ar'. All content is served
    // correctly from 'https://www.ign.gob.ar', so go there directly.
    url.protocol = "https:";
  }
  return url;
}
2023-11-28 21:22:25 +00:00
// https://stackoverflow.com/a/12646864
/**
 * Shuffles `array` in place: walking from the last index down to 1, each
 * element is swapped with one at a random index at or below it.
 *
 * @param {any[]} array
 */
function shuffleArray(array) {
  let last = array.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = array[last];
    array[last] = array[pick];
    array[pick] = held;
    last -= 1;
  }
}