descargar de ckan directo inventando un data.json

por si no soporta data.json

y guardar cantidad de datasets en metadata
This commit is contained in:
Cat /dev/Nulo 2023-12-09 16:53:49 -03:00
parent 11a64468e5
commit 81de080d22
5 changed files with 243 additions and 62 deletions

View file

@ -28,6 +28,7 @@ export const zData = z.object({
homepage: z.string().optional(),
dataset: z.array(zDataset),
});
/** @typedef {z.infer<typeof zData>} Data */
export const zError = z.object({
url: z.string().optional(),

View file

@ -0,0 +1,152 @@
import { request } from "undici";
import z from "zod";
import { userAgent } from "./config.js";
import { basename } from "path";
// Response envelope of CKAN's `package_list` action as consumed by getCkanPackageList:
// `success` must be literally `true` and `result` must be an array of strings
// (the dataset names). Any other shape makes `.parse()` throw.
const zCkanPackageList = z.object({
success: z.literal(true),
result: z.array(z.string()),
});
/**
 * Requests `url` sending the project's User-Agent and returns the decoded JSON body.
 * The result is not validated here; callers run it through a zod schema.
 *
 * @param {string} url
 * @throws {Error} when the server answers with a non-2xx status code
 */
async function getJson(url) {
  const res = await request(url, {
    headers: {
      "User-Agent": userAgent,
    },
  });
  // undici's `request` resolves for any status code, so an error page or a
  // redirect would otherwise reach `body.json()` and fail with an opaque parse error.
  if (res.statusCode < 200 || res.statusCode > 299) {
    // The body has to be consumed (or dumped) so the connection can be reused.
    await res.body.dump();
    throw new Error(`HTTP ${res.statusCode} al pedir ${url}`);
  }
  return await res.body.json();
}
/**
 * Downloads the list of dataset names published by a CKAN portal.
 * @param {string} ckanUrl base URL of the portal; the API path is appended to it as-is
 * @returns {Promise<string[]>}
 */
async function getCkanPackageList(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/package_list`;
  const response = zCkanPackageList.parse(await getJson(endpoint));
  return response.result;
}
// Organization object nested inside a package (used as zCkanPackage.organization).
// All four fields are required strings.
const zCkanOrganization = z.object({
name: z.string(),
title: z.string(),
id: z.string(),
created: z.string(),
});
// A single resource (file) of a package. generateDataJsonFromCkan turns each one
// into a `distribution` entry, reading id, name, description, format and url.
const zCkanResource = z.object({
id: z.string(),
name: z.string(),
description: z.string(),
format: z.string(),
url: z.string(),
});
// Tag attached to a package (used as the element type of zCkanPackage.tags).
const zCkanTag = z.object({
id: z.string(),
display_name: z.string(),
name: z.string(),
});
// Group a package belongs to (used as the element type of zCkanPackage.groups).
const zCkanGroup = z.object({
id: z.string(),
display_name: z.string(),
name: z.string(),
description: z.string(),
});
// A CKAN package (dataset) as returned inside the `package_show` envelope.
// Every field is declared as a required, non-null value, so a response that
// omits one of them or sends `null` makes `.parse()` throw.
// NOTE(review): some portals may send null for optional metadata such as
// maintainer/author — confirm against real responses before relying on this.
const zCkanPackage = z.object({
license_title: z.string(),
license_id: z.string(),
license_url: z.string(),
// maintainer / maintainer_email become the data.json `publisher`.
maintainer: z.string(),
maintainer_email: z.string(),
id: z.string(),
name: z.string(),
title: z.string(),
metadata_created: z.string(),
metadata_modified: z.string(),
author: z.string(),
author_email: z.string(),
resources: z.array(zCkanResource),
tags: z.array(zCkanTag),
groups: z.array(zCkanGroup),
organization: zCkanOrganization,
// Used as the dataset's `landingPage`.
url: z.string(),
// Used as the dataset's `description`.
notes: z.string(),
});
// Response envelope of CKAN's `package_show` action: `success` must be literally
// `true` and `result` must satisfy zCkanPackage.
const zCkanPackageShow = z.object({
success: z.literal(true),
result: zCkanPackage,
});
/**
 * Downloads and validates the metadata of a single CKAN package.
 * @param {string} ckanUrl base URL of the portal
 * @param {string} packageName name of the package; it is URL-encoded before being sent
 */
async function getCkanPackage(ckanUrl, packageName) {
  const id = encodeURIComponent(packageName);
  const response = await getJson(
    `${ckanUrl}/api/3/action/package_show?id=${id}`
  );
  const { result } = zCkanPackageShow.parse(response);
  return result;
}
/**
 * Response envelope of CKAN's `status_show` action, which carries general
 * information about the portal. The fields read by generateDataJsonFromCkan
 * (site_title, site_description, site_url) may come back as empty strings,
 * which is why that function falls back to other values.
 */
const zCkanStatusShow = z.object({
  success: z.literal(true),
  result: z.object({
    // The title description used to be attached to site_url by mistake.
    site_url: z.string().describe("URL del portal. A veces vacio."),
    site_description: z
      .string()
      .describe("Descripción del portal. A veces vacio."),
    site_title: z.string().describe("Titulo del portal. A veces vacio."),
    error_emails_to: z.string(),
  }),
});
/**
 * Retrieves general information about the portal through `status_show`.
 * @param {string} ckanUrl base URL of the portal
 */
async function getCkanInfo(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/status_show`;
  const { result } = zCkanStatusShow.parse(await getJson(endpoint));
  return result;
}
/**
 * Builds a data.json-shaped catalog from a CKAN portal's API, for portals that
 * may not publish an official data.json.
 *
 * One `package_show` request is issued per package name returned by
 * `package_list`; if any request or validation fails the whole call rejects.
 *
 * @param {string} ckanUrl base URL of the CKAN portal
 */
export async function generateDataJsonFromCkan(ckanUrl) {
  // The package list and the portal info are independent, so request both at once
  // instead of waiting for one before starting the other.
  const [list, info] = await Promise.all([
    getCkanPackageList(ckanUrl),
    getCkanInfo(ckanUrl),
  ]);
  const packages = await Promise.all(
    list.map((n) => getCkanPackage(ckanUrl, n))
  );
  /** @type {import("common/schema.js").Data & { generatedBy: string }} */
  const data = {
    generatedBy:
      "archivador de datos abiertos (ckan_to_datajson) <https://github.com/catdevnull/transicion-desordenada-diablo>",
    // Portal fields can be empty strings; fall back to the URL where that makes sense.
    title: info.site_title || ckanUrl,
    description: info.site_description || "",
    homepage: info.site_url || ckanUrl,
    dataset: packages.map((p) => ({
      title: p.title,
      description: p.notes,
      identifier: p.id,
      publisher: {
        name: p.maintainer,
        mbox: p.maintainer_email,
      },
      landingPage: p.url,
      distribution: p.resources.map((r) => ({
        identifier: r.id,
        title: r.name,
        description: r.description,
        // The last segment of the resource URL doubles as the file name.
        fileName: basename(r.url),
        format: r.format,
        downloadURL: r.url,
      })),
    })),
  };
  return data;
}

View file

@ -1,52 +1,59 @@
export const sitiosPorDefecto = [
"https://datos.gob.ar/data.json",
"http://datos.energia.gob.ar/data.json",
"https://datos.magyp.gob.ar/data.json",
"https://datos.acumar.gov.ar/data.json",
"https://datasets.datos.mincyt.gob.ar/data.json",
"https://datos.arsat.com.ar/data.json",
"https://datos.cultura.gob.ar/data.json",
"https://datos.mininterior.gob.ar/data.json",
"https://datos.produccion.gob.ar/data.json",
"https://datos.salud.gob.ar/data.json",
"https://datos.transporte.gob.ar/data.json",
"https://ckan.ciudaddemendoza.gov.ar/data.json",
"https://datos.santafe.gob.ar/data.json",
"https://datosabiertos.chaco.gob.ar/data.json",
"https://datosabiertos.mercedes.gob.ar/data.json",
"http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json",
"https://datosabiertos.desarrollosocial.gob.ar/data.json",
"http://datos.mindef.gov.ar/data.json",
export const targetsPorDefecto = [
"datajson+https://datos.gob.ar/data.json",
"datajson+http://datos.energia.gob.ar/data.json",
"datajson+https://datos.magyp.gob.ar/data.json",
"datajson+https://datos.acumar.gov.ar/data.json",
"datajson+https://datasets.datos.mincyt.gob.ar/data.json",
"datajson+https://datos.arsat.com.ar/data.json",
"datajson+https://datos.cultura.gob.ar/data.json",
"datajson+https://datos.mininterior.gob.ar/data.json",
"datajson+https://datos.produccion.gob.ar/data.json",
"datajson+https://datos.salud.gob.ar/data.json",
"datajson+https://datos.transporte.gob.ar/data.json",
"datajson+https://ckan.ciudaddemendoza.gov.ar/data.json",
"datajson+https://datos.santafe.gob.ar/data.json",
"datajson+https://datosabiertos.chaco.gob.ar/data.json",
"datajson+https://datosabiertos.mercedes.gob.ar/data.json",
"datajson+http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json",
"datajson+https://datosabiertos.desarrollosocial.gob.ar/data.json",
"datajson+http://datos.mindef.gov.ar/data.json",
"datajson+http://datos.legislatura.gob.ar/data.json",
"https://monitoreo.datos.gob.ar/catalog/jgm/data.json",
// 'https://datosabiertos.enacom.gob.ar/data.json',
"https://monitoreo.datos.gob.ar/catalog/otros/data.json",
"https://monitoreo.datos.gob.ar/catalog/aaip/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json",
"https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/shn/data.json",
"https://monitoreo.datos.gob.ar/catalog/smn/data.json",
"https://monitoreo.datos.gob.ar/catalog/ign/data.json",
"https://monitoreo.datos.gob.ar/catalog/justicia/data.json",
"https://monitoreo.datos.gob.ar/catalog/seguridad/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json",
// "http://andino.siu.edu.ar/data.json",
"https://monitoreo.datos.gob.ar/catalog/educacion/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/inti/data.json",
"https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
"https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public",
"https://transparencia.enargas.gob.ar/data.json",
"https://infra.datos.gob.ar/catalog/sspm/data.json",
"https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
"https://monitoreo.datos.gob.ar/catalog/siep/data.json",
"https://monitoreo.datos.gob.ar/catalog/exterior/data.json",
"http://datos.pami.org.ar/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json",
"https://datos.yvera.gob.ar/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/dine/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/obras/data.json",
"https://monitoreo.datos.gob.ar/media/catalog/generos/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/jgm/data.json",
// "datajson+https://datosabiertos.enacom.gob.ar/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/otros/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/aaip/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/shn/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/smn/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/ign/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/justicia/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/seguridad/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json",
// "datajson+http://andino.siu.edu.ar/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/educacion/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/inti/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
"datajson+https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public",
"datajson+https://transparencia.enargas.gob.ar/data.json",
"datajson+https://infra.datos.gob.ar/catalog/sspm/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/siep/data.json",
"datajson+https://monitoreo.datos.gob.ar/catalog/exterior/data.json",
"datajson+http://datos.pami.org.ar/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json",
"datajson+https://datos.yvera.gob.ar/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/dine/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/obras/data.json",
"datajson+https://monitoreo.datos.gob.ar/media/catalog/generos/data.json",
"ckan+http://datos.jus.gob.ar", // justicia nacional
"ckan+https://datos.csjn.gov.ar", // corte suprema de justicia nacional
"ckan+https://datos.hcdn.gob.ar", // diputados nacional
"ckan+https://data.buenosaires.gob.ar", // CABA
"ckan+https://datos.tsjbaires.gov.ar", // tribunal superior de justicia CABA
];
// desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar

View file

@ -2,7 +2,8 @@ import { mkdir, open, writeFile } from "node:fs/promises";
import { Agent, fetch, request, setGlobalDispatcher } from "undici";
import { join, normalize } from "node:path";
import pLimit from "p-limit";
import { sitiosPorDefecto, userAgent } from "./config.js";
import { targetsPorDefecto, userAgent } from "./config.js";
import { generateDataJsonFromCkan } from "./ckan_to_datajson.js";
setGlobalDispatcher(
new Agent({
@ -25,26 +26,43 @@ class StatusCodeError extends Error {
}
}
class TooManyRedirectsError extends Error {}
let jsonUrls = process.argv.slice(2);
if (jsonUrls.length < 1) {
jsonUrls = sitiosPorDefecto;
let urls = process.argv.slice(2);
if (urls.length < 1) {
urls = targetsPorDefecto;
}
for (const url of jsonUrls)
downloadFromData(url).catch((error) =>
console.error(`${url} FALLÓ CON`, error)
/** @typedef {{type: "data.json" | "ckan"; url: string;}} Target */
/**
 * Download targets derived from the raw strings in `urls`. A `ckan+` prefix
 * selects the CKAN API; a `datajson+` prefix, or no prefix at all, means the
 * string is the URL of a data.json.
 * @type {Target[]}
 */
const targets = urls.map((raw) => {
  const ckanPrefix = "ckan+";
  const dataJsonPrefix = "datajson+";
  if (raw.startsWith(ckanPrefix)) {
    return { type: "ckan", url: raw.slice(ckanPrefix.length) };
  }
  // Anything that is not CKAN is a data.json URL, with or without its prefix.
  const url = raw.startsWith(dataJsonPrefix)
    ? raw.slice(dataJsonPrefix.length)
    : raw;
  return { type: "data.json", url };
});
// Start every download without awaiting it; a failing target is logged and
// does not prevent the others from running.
// `target` is an object, so it is logged as "<type>+<url>" rather than being
// interpolated directly (which would print "[object Object]").
for (const target of targets) {
  downloadFromData(target).catch((error) =>
    console.error(`${target.type}+${target.url} FALLÓ CON`, error)
  );
}
/**
* @param {string} jsonUrl
* @param {Target} target
*/
async function downloadFromData(jsonUrl) {
const outputPath = generateOutputPath(jsonUrl);
const jsonRes = await fetch(jsonUrl);
async function downloadFromData(target) {
const outputPath = generateOutputPath(target.url);
let json;
if (target.type === "ckan") {
json = await generateDataJsonFromCkan(target.url);
} else if (target.type === "data.json") {
const jsonRes = await fetch(target.url);
json = await jsonRes.json();
}
// prettier-ignore
const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
const parsed = /** @type {{ dataset: Dataset[] }} */(json)
await mkdir(outputPath, { recursive: true });
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
await writeFile(join(outputPath, "url.txt"), jsonUrl);
await writeFile(join(outputPath, "url.txt"), `${target.type}+${target.url}`);
const errorFile = (
await open(join(outputPath, "errors.jsonl"), "w")
).createWriteStream();

View file

@ -22,12 +22,15 @@ async function generateMetadata(dumpDir) {
.map(async (file) => {
const path = join(file.path, file.name);
const data = await loadDataJson(path);
const url = await readFile(join(path, "url.txt"), "utf-8");
// url.txt holds "<type>+<url>". The downloader writes <type> as "data.json" or
// "ckan"; "datajson+" (the spelling used in the target list) is accepted too.
// Files written before the prefix existed contain the bare URL and are left untouched.
const url = (await readFile(join(path, "url.txt"), "utf-8")).replace(
  /^(?:data\.json|datajson|ckan)\+/,
  ""
);
return {
title: data.title,
description: data.description,
url,
path: file.name,
nDatasets: data.dataset.length,
};
})
);