traer json, usar undici.request, otras cosas

This commit is contained in:
Cat /dev/Nulo 2023-11-27 23:10:24 -03:00
parent 94b909abfa
commit b717af8ce9
2 changed files with 29 additions and 25 deletions

3
.gitignore vendored
View file

@ -1,4 +1,5 @@
node_modules/ node_modules/
dataJsons/ dataJsons/
log log
prueba prueba
datos.gob.ar/

View file

@ -1,16 +1,18 @@
// @ts-check // @ts-check
import { mkdir, open } from "node:fs/promises"; import { mkdir, open, writeFile } from "node:fs/promises";
import { Agent, fetch } from "undici"; import { Agent, fetch, request, setGlobalDispatcher } from "undici";
import { join, normalize } from "node:path"; import { join, normalize } from "node:path";
import { pipeline } from "node:stream/promises"; import { pipeline } from "node:stream/promises";
// FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen. // FYI: al menos los siguientes dominios no tienen la cadena completa de certificados en HTTPS. tenemos que usar un hack (node_extra_ca_certs_mozilla_bundle) para conectarnos a estos sitios. (se puede ver con ssllabs.com) ojalá lxs administradorxs de estos servidores lo arreglen.
// www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar // www.enargas.gov.ar, transparencia.enargas.gov.ar, www.energia.gob.ar, www.economia.gob.ar, datos.yvera.gob.ar
const dispatcher = new Agent({ setGlobalDispatcher(
pipelining: 50, new Agent({
maxRedirections: 20, pipelining: 0,
}); maxRedirections: 20,
})
);
class StatusCodeError extends Error { class StatusCodeError extends Error {
/** /**
@ -22,19 +24,20 @@ class StatusCodeError extends Error {
} }
} }
const outputPath = process.argv[2]; let jsonUrlString = process.argv[2];
if (!outputPath) { if (!jsonUrlString) {
console.error("Especificamente el output porfa"); console.error("Especificamente el url al json porfa");
process.exit(1); process.exit(1);
} }
const jsonUrl = new URL(jsonUrlString);
const outputPath = jsonUrl.host;
await mkdir(outputPath, { recursive: true }); await mkdir(outputPath, { recursive: true });
const errorFile = await open(join(outputPath, "errors.jsonl"), "w"); const errorFile = await open(join(outputPath, "errors.jsonl"), "w");
// Leer JSON de stdin const jsonRes = await fetch(jsonUrl);
const json = await process.stdin.toArray(); // prettier-ignore
const jsonString = json.join(""); const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
/** @type {{ dataset: Dataset[] }} */ await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
const parsed = JSON.parse(jsonString);
const jobs = parsed.dataset.flatMap((dataset) => const jobs = parsed.dataset.flatMap((dataset) =>
dataset.distribution.map((dist) => ({ dataset.distribution.map((dist) => ({
@ -60,7 +63,7 @@ for (const job of jobs) {
} }
const greens = [...jobsPerHost.entries()].flatMap(([host, jobs]) => { const greens = [...jobsPerHost.entries()].flatMap(([host, jobs]) => {
const nThreads = 128; const nThreads = 8;
return Array(nThreads) return Array(nThreads)
.fill(0) .fill(0)
.map(() => .map(() =>
@ -71,7 +74,10 @@ const greens = [...jobsPerHost.entries()].flatMap(([host, jobs]) => {
await downloadDistWithRetries(job); await downloadDistWithRetries(job);
} catch (error) { } catch (error) {
await errorFile.write( await errorFile.write(
JSON.stringify({ url: job.url.toString(), ...encodeError(error) }) JSON.stringify({
url: job.url.toString(),
...encodeError(error),
}) + "\n"
); );
nErrors++; nErrors++;
} finally { } finally {
@ -128,11 +134,9 @@ async function downloadDistWithRetries(job, tries = 0) {
async function downloadDist({ dist, dataset }) { async function downloadDist({ dist, dataset }) {
const url = new URL(dist.downloadURL); const url = new URL(dist.downloadURL);
const res = await fetch(url.toString(), { const res = await request(url.toString());
dispatcher, if (res.statusCode < 200 || res.statusCode > 299) {
}); throw new StatusCodeError(res.statusCode);
if (!res.ok) {
throw new StatusCodeError(res.status);
} }
const fileDirPath = join( const fileDirPath = join(
@ -201,10 +205,9 @@ function encodeError(error) {
return { kind: "http_error", status_code: error.code }; return { kind: "http_error", status_code: error.code };
else if (errorIsInfiniteRedirect(error)) return { kind: "infinite_redirect" }; else if (errorIsInfiniteRedirect(error)) return { kind: "infinite_redirect" };
else { else {
console.error(error, error.cause.message); return { kind: "generic_error", error: error.message };
return { kind: "generic_error", error };
} }
} }
function errorIsInfiniteRedirect(error) { function errorIsInfiniteRedirect(error) {
return error?.cause?.message === "redirect count exceeded"; return error?.message === "redirect count exceeded";
} }