Mirror of https://github.com/catdevnull/transicion-desordenada-diablo, synced 2024-11-15 02:21:39 +00:00
fetch the json, use undici.request, other things
commit b717af8ce9 (parent 94b909abfa)
2 changed files with 29 additions and 25 deletions
.gitignore (vendored) | 1 +
@@ -2,3 +2,4 @@ node_modules/
 dataJsons/
 log
 prueba
+datos.gob.ar/
@@ -1,16 +1,18 @@
 // @ts-check
-import { mkdir, open } from "node:fs/promises";
-import { Agent, fetch } from "undici";
+import { mkdir, open, writeFile } from "node:fs/promises";
+import { Agent, fetch, request, setGlobalDispatcher } from "undici";
 import { join, normalize } from "node:path";
 import { pipeline } from "node:stream/promises";
 
 // FYI: at least the following domains don't serve the complete HTTPS certificate chain. We have to use a hack (node_extra_ca_certs_mozilla_bundle) to connect to these sites. (You can check with ssllabs.com.) Hopefully the administrators of these servers fix it.
 // www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
 
-const dispatcher = new Agent({
-  pipelining: 50,
-  maxRedirections: 20,
-});
+setGlobalDispatcher(
+  new Agent({
+    pipelining: 0,
+    maxRedirections: 20,
+  })
+);
 
 class StatusCodeError extends Error {
   /**
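
Note on the dispatcher change: instead of building an Agent and passing it per call, the script now registers it globally. A minimal sketch of what that means, assuming undici's documented Agent options (per undici's docs, pipelining: 0 also disables keep-alive); the URL is hypothetical:

// Sketch, not part of the commit: every undici fetch/request in the
// process now goes through this Agent implicitly.
import { Agent, request, setGlobalDispatcher } from "undici";

setGlobalDispatcher(
  new Agent({
    pipelining: 0, // down from 50; 0 disables pipelining (and keep-alive)
    maxRedirections: 20, // follow up to 20 redirects before failing
  })
);

// Later calls pick the global Agent up without a { dispatcher } option:
const { statusCode } = await request("https://example.com/"); // hypothetical URL
console.log(statusCode);
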
@@ -22,19 +24,20 @@ class StatusCodeError extends Error {
   }
 }
 
-const outputPath = process.argv[2];
-if (!outputPath) {
-  console.error("Especificamente el output porfa");
+let jsonUrlString = process.argv[2];
+if (!jsonUrlString) {
+  console.error("Especificamente el url al json porfa");
   process.exit(1);
 }
+const jsonUrl = new URL(jsonUrlString);
+const outputPath = jsonUrl.host;
 await mkdir(outputPath, { recursive: true });
 const errorFile = await open(join(outputPath, "errors.jsonl"), "w");
 
-// Read JSON from stdin
-const json = await process.stdin.toArray();
-const jsonString = json.join("");
-/** @type {{ dataset: Dataset[] }} */
-const parsed = JSON.parse(jsonString);
+const jsonRes = await fetch(jsonUrl);
+// prettier-ignore
+const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
+await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
 
 const jobs = parsed.dataset.flatMap((dataset) =>
   dataset.distribution.map((dist) => ({
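
This hunk swaps the input mechanism: the catalog JSON is fetched from a URL given on the command line instead of being piped through stdin, and the output directory is derived from that URL's host. A condensed sketch of the new flow, assuming Node 18+ where fetch is global (the invocation is hypothetical):

// hypothetical invocation: node script.js https://datos.gob.ar/data.json
import { mkdir, writeFile } from "node:fs/promises";
import { join } from "node:path";

const jsonUrl = new URL(process.argv[2]);
const outputPath = jsonUrl.host; // e.g. "datos.gob.ar"
await mkdir(outputPath, { recursive: true });

const res = await fetch(jsonUrl);
const parsed = await res.json();
// persist the raw catalog next to the downloads
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
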
@@ -60,7 +63,7 @@ for (const job of jobs) {
 }
 
 const greens = [...jobsPerHost.entries()].flatMap(([host, jobs]) => {
-  const nThreads = 128;
+  const nThreads = 8;
   return Array(nThreads)
     .fill(0)
     .map(() =>
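
The surrounding loop is outside this hunk; a minimal sketch of the per-host worker-pool pattern it suggests, with hypothetical names. Each host gets nThreads workers draining a shared queue, so lowering nThreads from 128 to 8 caps concurrent downloads per host:

// Sketch only: nThreads async "green threads" pull jobs until the
// queue for their host is empty.
async function runPerHost(jobs, nThreads, run) {
  const queue = [...jobs];
  const workers = Array(nThreads)
    .fill(0)
    .map(async () => {
      let job;
      while ((job = queue.shift()) !== undefined) {
        await run(job);
      }
    });
  await Promise.all(workers);
}
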
@@ -71,7 +74,10 @@ const greens = [...jobsPerHost.entries()].flatMap(([host, jobs]) => {
         await downloadDistWithRetries(job);
       } catch (error) {
         await errorFile.write(
-          JSON.stringify({ url: job.url.toString(), ...encodeError(error) })
+          JSON.stringify({
+            url: job.url.toString(),
+            ...encodeError(error),
+          }) + "\n"
         );
         nErrors++;
       } finally {
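
The added "\n" matters because errors.jsonl is a JSON-lines file: one object per line. Without it, consecutive records would run together and the file could not be parsed line by line. A sketch of writing and reading the format back (the reader is hypothetical, not part of this commit):

import { open, readFile } from "node:fs/promises";

const errorFile = await open("errors.jsonl", "w");
await errorFile.write(
  JSON.stringify({ url: "https://example.com/a.csv", kind: "http_error" }) + "\n"
);
await errorFile.close();

// read back: split on newlines, parse each non-empty line
const errors = (await readFile("errors.jsonl", "utf8"))
  .split("\n")
  .filter(Boolean)
  .map((line) => JSON.parse(line));
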
@@ -128,11 +134,9 @@ async function downloadDistWithRetries(job, tries = 0) {
 async function downloadDist({ dist, dataset }) {
   const url = new URL(dist.downloadURL);
 
-  const res = await fetch(url.toString(), {
-    dispatcher,
-  });
-  if (!res.ok) {
-    throw new StatusCodeError(res.status);
+  const res = await request(url.toString());
+  if (res.statusCode < 200 || res.statusCode > 299) {
+    throw new StatusCodeError(res.statusCode);
   }
 
   const fileDirPath = join(
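
This hunk adapts to the API difference between fetch and undici.request: request resolves with { statusCode, headers, body } and has no res.ok or res.status, so the 2xx check is done by hand. Its body is a Node Readable, which suits streaming downloads to disk. A sketch (the file name is hypothetical):

import { createWriteStream } from "node:fs";
import { pipeline } from "node:stream/promises";
import { request } from "undici";

const res = await request("https://example.com/data.csv");
if (res.statusCode < 200 || res.statusCode > 299) {
  throw new Error(`HTTP ${res.statusCode}`);
}
// stream the response body straight to a file
await pipeline(res.body, createWriteStream("data.csv"));
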
@@ -201,10 +205,9 @@ function encodeError(error) {
     return { kind: "http_error", status_code: error.code };
   else if (errorIsInfiniteRedirect(error)) return { kind: "infinite_redirect" };
   else {
-    console.error(error, error.cause.message);
-    return { kind: "generic_error", error };
+    return { kind: "generic_error", error: error.message };
   }
 }
 function errorIsInfiniteRedirect(error) {
-  return error?.cause?.message === "redirect count exceeded";
+  return error?.message === "redirect count exceeded";
 }
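
Why the check moved from error.cause.message to error.message: Node's WHATWG fetch wraps network failures as TypeError("fetch failed", { cause }) with the real undici error nested inside, while undici.request throws that error directly. A sketch under that assumption (the exact message text is illustrative):

import { request } from "undici";

try {
  await request("https://does-not-resolve.invalid/"); // hypothetical host
} catch (error) {
  // with request(), the useful message is on the error itself,
  // not on error.cause as it was with fetch
  console.log(error.message); // e.g. "getaddrinfo ENOTFOUND ..."
}
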