mirror of
https://github.com/catdevnull/transicion-desordenada-diablo
synced 2024-11-23 08:26:20 +00:00
descargar de ckan directo inventando un data.json
por si no soporta data.json y guardar cantidad de datasets en metadata
This commit is contained in:
parent
11a64468e5
commit
81de080d22
5 changed files with 243 additions and 62 deletions
|
@ -28,6 +28,7 @@ export const zData = z.object({
|
||||||
homepage: z.string().optional(),
|
homepage: z.string().optional(),
|
||||||
dataset: z.array(zDataset),
|
dataset: z.array(zDataset),
|
||||||
});
|
});
|
||||||
|
/** @typedef {z.infer<typeof zData>} Data */
|
||||||
|
|
||||||
export const zError = z.object({
|
export const zError = z.object({
|
||||||
url: z.string().optional(),
|
url: z.string().optional(),
|
||||||
|
|
152
downloader/ckan_to_datajson.js
Normal file
152
downloader/ckan_to_datajson.js
Normal file
|
@ -0,0 +1,152 @@
|
||||||
|
import { request } from "undici";
|
||||||
|
import z from "zod";
|
||||||
|
import { userAgent } from "./config.js";
|
||||||
|
import { basename } from "path";
|
||||||
|
|
||||||
|
// Expected shape of a successful `package_list` response:
// `result` carries the dataset names as plain strings.
const zCkanPackageList = z.object({
  success: z.literal(true),
  result: z.string().array(),
});
|
||||||
|
|
||||||
|
/**
 * Performs a GET request (sending the configured User-Agent) and returns the
 * response body decoded as JSON. The payload is not validated here; callers
 * run it through their own schema.
 *
 * @param {string} url Absolute URL to fetch.
 * @returns {Promise<any>} The decoded JSON payload.
 * @throws {Error} When the response status is not 2xx, or when the body is
 *   not valid JSON (the original parse error is attached as `cause`).
 */
async function getJson(url) {
  const res = await request(url, {
    headers: {
      "User-Agent": userAgent,
    },
  });
  // Read the whole body before looking at the status so that it is consumed
  // on the error path as well.
  const text = await res.body.text();
  if (res.statusCode < 200 || res.statusCode >= 300) {
    throw new Error(`${url} respondió con status HTTP ${res.statusCode}`);
  }
  try {
    return JSON.parse(text);
  } catch (error) {
    // Keep the URL in the message: a bare SyntaxError does not say which
    // endpoint returned the unparsable body.
    throw new Error(`${url} no devolvió JSON válido`, { cause: error });
  }
}
|
||||||
|
|
||||||
|
/**
 * Downloads the list of dataset names published by a CKAN portal.
 * @param {string} ckanUrl Base URL of the portal; the API path is appended to it.
 * @returns {Promise<string[]>}
 */
async function getCkanPackageList(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/package_list`;
  const { result } = zCkanPackageList.parse(await getJson(endpoint));
  return result;
}
|
||||||
|
|
||||||
|
// Schemas for the parts of a `package_show` response that this module reads
// or validates.
// NOTE(review): every field below is required and must be a string/array/object;
// a package where any of them is null or missing makes the parse throw.
// Confirm the target portals always populate them.

// Organization that owns a package.
const zCkanOrganization = z.object({
  name: z.string(),
  title: z.string(),
  id: z.string(),
  created: z.string(),
});

// A single downloadable resource attached to a package.
const zCkanResource = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string(),
  format: z.string(),
  url: z.string(),
});

// Tag attached to a package.
const zCkanTag = z.object({
  id: z.string(),
  display_name: z.string(),
  name: z.string(),
});

// Group a package belongs to.
const zCkanGroup = z.object({
  id: z.string(),
  display_name: z.string(),
  name: z.string(),
  description: z.string(),
});

// A CKAN package (dataset) together with its nested resources, tags, groups
// and organization.
const zCkanPackage = z.object({
  license_title: z.string(),
  license_id: z.string(),
  license_url: z.string(),
  maintainer: z.string(),
  maintainer_email: z.string(),
  id: z.string(),
  name: z.string(),
  title: z.string(),
  metadata_created: z.string(),
  metadata_modified: z.string(),
  author: z.string(),
  author_email: z.string(),
  resources: zCkanResource.array(),
  tags: zCkanTag.array(),
  groups: zCkanGroup.array(),
  organization: zCkanOrganization,
  url: z.string(),
  notes: z.string(),
});

// Envelope of a successful `package_show` response.
const zCkanPackageShow = z.object({
  success: z.literal(true),
  result: zCkanPackage,
});
|
||||||
|
|
||||||
|
/**
 * Fetches one package (dataset) through `package_show` and validates it.
 * @param {string} ckanUrl Base URL of the portal; the API path is appended to it.
 * @param {string} packageName Package name; it is URI-encoded before being sent.
 */
async function getCkanPackage(ckanUrl, packageName) {
  const id = encodeURIComponent(packageName);
  const response = await getJson(
    `${ckanUrl}/api/3/action/package_show?id=${id}`
  );
  const { result } = zCkanPackageShow.parse(response);
  return result;
}
|
||||||
|
|
||||||
|
// Envelope of a successful `status_show` response with the portal-level
// fields this module uses. The descriptions are attached to the fields they
// actually describe (the title text used to sit on `site_url`).
const zCkanStatusShow = z.object({
  success: z.literal(true),
  result: z.object({
    site_url: z.string().describe("URL del portal. A veces vacio."),
    site_description: z
      .string()
      .describe("Descripción del portal. A veces vacio."),
    site_title: z.string().describe("Titulo del portal. A veces vacio."),
    error_emails_to: z.string(),
  }),
});
|
||||||
|
|
||||||
|
/**
 * Retrieves general information about the portal through `status_show`.
 * @param {string} ckanUrl Base URL of the portal; the API path is appended to it.
 */
async function getCkanInfo(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/status_show`;
  const { result } = zCkanStatusShow.parse(await getJson(endpoint));
  return result;
}
|
||||||
|
|
||||||
|
/**
 * Maps `items` through an async `mapper`, running at most `limit` calls at a
 * time. Results keep the order of `items`. Rejects with the first error and
 * stops handing out new items once any call has failed.
 *
 * @template T, R
 * @param {T[]} items
 * @param {number} limit Maximum number of simultaneous calls (values that are not positive integers fall back to 1).
 * @param {(item: T, index: number) => Promise<R>} mapper
 * @returns {Promise<R[]>}
 */
async function mapWithConcurrency(items, limit, mapper) {
  const safeLimit = Number.isInteger(limit) && limit > 0 ? limit : 1;
  /** @type {R[]} */
  const results = new Array(items.length);
  let nextIndex = 0;
  let failed = false;

  async function worker() {
    while (!failed && nextIndex < items.length) {
      const index = nextIndex++;
      try {
        results[index] = await mapper(items[index], index);
      } catch (error) {
        failed = true;
        throw error;
      }
    }
  }

  const workerCount = Math.min(safeLimit, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

/**
 * Builds a data.json catalog from a CKAN portal that may not publish an
 * official data.json of its own.
 *
 * The package list and the portal info are requested in parallel; then every
 * package is fetched with a bounded number of simultaneous requests instead
 * of one request per dataset all at once.
 *
 * @param {string} ckanUrl Base URL of the CKAN portal.
 * @param {{ concurrency?: number }} [options] `concurrency` caps the number of
 *   simultaneous `package_show` requests (default 8).
 * @throws Rejects if any request or schema validation fails.
 */
export async function generateDataJsonFromCkan(
  ckanUrl,
  { concurrency = 8 } = {}
) {
  const [list, info] = await Promise.all([
    getCkanPackageList(ckanUrl),
    getCkanInfo(ckanUrl),
  ]);
  const packages = await mapWithConcurrency(list, concurrency, (n) =>
    getCkanPackage(ckanUrl, n)
  );
  /** @type {import("common/schema.js").Data & { generatedBy: string }} */
  const data = {
    generatedBy:
      "archivador de datos abiertos (ckan_to_datajson) <https://github.com/catdevnull/transicion-desordenada-diablo>",
    // `||` on purpose: an empty string from the portal also falls back.
    title: info.site_title || ckanUrl,
    description: info.site_description || "",
    homepage: info.site_url || ckanUrl,
    dataset: packages.map((p) => ({
      title: p.title,
      description: p.notes,
      identifier: p.id,
      publisher: {
        name: p.maintainer,
        mbox: p.maintainer_email,
      },
      landingPage: p.url,
      distribution: p.resources.map((r) => ({
        identifier: r.id,
        title: r.name,
        description: r.description,
        // NOTE(review): basename of the raw URL keeps any query string in the
        // file name — confirm downstream consumers tolerate that.
        fileName: basename(r.url),
        format: r.format,
        downloadURL: r.url,
      })),
    })),
  };
  return data;
}
|
|
@ -1,52 +1,59 @@
|
||||||
// Default download targets. Each entry is `<kind>+<url>`:
//   - `datajson+URL`: URL of an already published data.json catalog.
//   - `ckan+URL`: base URL of a CKAN portal; a data.json is generated from its API.
// Every target is listed exactly once (the ssprys catalog used to appear twice,
// which made the same catalog be downloaded twice).
export const targetsPorDefecto = [
  "datajson+https://datos.gob.ar/data.json",
  "datajson+http://datos.energia.gob.ar/data.json",
  "datajson+https://datos.magyp.gob.ar/data.json",
  "datajson+https://datos.acumar.gov.ar/data.json",
  "datajson+https://datasets.datos.mincyt.gob.ar/data.json",
  "datajson+https://datos.arsat.com.ar/data.json",
  "datajson+https://datos.cultura.gob.ar/data.json",
  "datajson+https://datos.mininterior.gob.ar/data.json",
  "datajson+https://datos.produccion.gob.ar/data.json",
  "datajson+https://datos.salud.gob.ar/data.json",
  "datajson+https://datos.transporte.gob.ar/data.json",
  "datajson+https://ckan.ciudaddemendoza.gov.ar/data.json",
  "datajson+https://datos.santafe.gob.ar/data.json",
  "datajson+https://datosabiertos.chaco.gob.ar/data.json",
  "datajson+https://datosabiertos.mercedes.gob.ar/data.json",
  "datajson+http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json",
  "datajson+https://datosabiertos.desarrollosocial.gob.ar/data.json",
  "datajson+http://datos.mindef.gov.ar/data.json",
  "datajson+http://datos.legislatura.gob.ar/data.json",

  "datajson+https://monitoreo.datos.gob.ar/catalog/jgm/data.json",
  // "datajson+https://datosabiertos.enacom.gob.ar/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/otros/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/aaip/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/shn/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/smn/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/ign/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/justicia/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/seguridad/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json",
  // "datajson+http://andino.siu.edu.ar/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/educacion/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/inti/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
  "datajson+https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public",
  "datajson+https://transparencia.enargas.gob.ar/data.json",
  "datajson+https://infra.datos.gob.ar/catalog/sspm/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/siep/data.json",
  "datajson+https://monitoreo.datos.gob.ar/catalog/exterior/data.json",
  "datajson+http://datos.pami.org.ar/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json",
  "datajson+https://datos.yvera.gob.ar/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/dine/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/obras/data.json",
  "datajson+https://monitoreo.datos.gob.ar/media/catalog/generos/data.json",

  "ckan+http://datos.jus.gob.ar", // national justice
  "ckan+https://datos.csjn.gov.ar", // national supreme court of justice
  "ckan+https://datos.hcdn.gob.ar", // national chamber of deputies
  "ckan+https://data.buenosaires.gob.ar", // CABA (City of Buenos Aires)
  "ckan+https://datos.tsjbaires.gov.ar", // CABA superior court of justice
];
|
||||||
|
|
||||||
// desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar
|
// desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar
|
||||||
|
|
|
@ -2,7 +2,8 @@ import { mkdir, open, writeFile } from "node:fs/promises";
|
||||||
import { Agent, fetch, request, setGlobalDispatcher } from "undici";
|
import { Agent, fetch, request, setGlobalDispatcher } from "undici";
|
||||||
import { join, normalize } from "node:path";
|
import { join, normalize } from "node:path";
|
||||||
import pLimit from "p-limit";
|
import pLimit from "p-limit";
|
||||||
import { sitiosPorDefecto, userAgent } from "./config.js";
|
import { targetsPorDefecto, userAgent } from "./config.js";
|
||||||
|
import { generateDataJsonFromCkan } from "./ckan_to_datajson.js";
|
||||||
|
|
||||||
setGlobalDispatcher(
|
setGlobalDispatcher(
|
||||||
new Agent({
|
new Agent({
|
||||||
|
@ -25,26 +26,43 @@ class StatusCodeError extends Error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class TooManyRedirectsError extends Error {}
|
class TooManyRedirectsError extends Error {}
|
||||||
// Targets come from the command line; with no arguments the default list is used.
let urls = process.argv.slice(2);
if (urls.length < 1) {
  urls = targetsPorDefecto;
}
// NOTE(review): the CLI prefix parsed below is `datajson+`, while the target
// type literal is "data.json". Code that later writes `${type}+${url}` will
// therefore emit a `data.json+` prefix — confirm readers strip both spellings.
/** @typedef {{type: "data.json" | "ckan"; url: string;}} Target */

/**
 * Parsed targets: the `datajson+` / `ckan+` prefix selects the type, and a
 * bare URL is treated as a data.json URL.
 * @type {Target[]}
 */
const targets = urls.map((url) => {
  if (url.startsWith("datajson+")) {
    return { type: "data.json", url: url.slice("datajson+".length) };
  } else if (url.startsWith("ckan+")) {
    return { type: "ckan", url: url.slice("ckan+".length) };
  } else return { type: "data.json", url };
});
// All downloads are started without awaiting each other; a failure is logged
// and does not stop the remaining targets.
for (const target of targets)
  downloadFromData(target).catch((error) =>
    // Interpolate the fields explicitly: `${target}` would print "[object Object]".
    console.error(`${target.url} (${target.type}) FALLÓ CON`, error)
  );
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param {string} jsonUrl
|
* @param {Target} target
|
||||||
*/
|
*/
|
||||||
async function downloadFromData(jsonUrl) {
|
async function downloadFromData(target) {
|
||||||
const outputPath = generateOutputPath(jsonUrl);
|
const outputPath = generateOutputPath(target.url);
|
||||||
const jsonRes = await fetch(jsonUrl);
|
let json;
|
||||||
|
if (target.type === "ckan") {
|
||||||
|
json = await generateDataJsonFromCkan(target.url);
|
||||||
|
} else if (target.type === "data.json") {
|
||||||
|
const jsonRes = await fetch(target.url);
|
||||||
|
json = await jsonRes.json();
|
||||||
|
}
|
||||||
|
|
||||||
// prettier-ignore
|
// prettier-ignore
|
||||||
const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
|
const parsed = /** @type {{ dataset: Dataset[] }} */(json)
|
||||||
await mkdir(outputPath, { recursive: true });
|
await mkdir(outputPath, { recursive: true });
|
||||||
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
|
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
|
||||||
await writeFile(join(outputPath, "url.txt"), jsonUrl);
|
await writeFile(join(outputPath, "url.txt"), `${target.type}+${target.url}`);
|
||||||
const errorFile = (
|
const errorFile = (
|
||||||
await open(join(outputPath, "errors.jsonl"), "w")
|
await open(join(outputPath, "errors.jsonl"), "w")
|
||||||
).createWriteStream();
|
).createWriteStream();
|
||||||
|
|
|
@ -22,12 +22,15 @@ async function generateMetadata(dumpDir) {
|
||||||
.map(async (file) => {
|
.map(async (file) => {
|
||||||
const path = join(file.path, file.name);
|
const path = join(file.path, file.name);
|
||||||
const data = await loadDataJson(path);
|
const data = await loadDataJson(path);
|
||||||
const url = await readFile(join(path, "url.txt"), "utf-8");
|
let url = await readFile(join(path, "url.txt"), "utf-8");
|
||||||
|
if (url.startsWith("datajson+") || url.startsWith("ckan+"))
|
||||||
|
url = url.slice(url.indexOf("+") + 1);
|
||||||
return {
|
return {
|
||||||
title: data.title,
|
title: data.title,
|
||||||
description: data.description,
|
description: data.description,
|
||||||
url,
|
url,
|
||||||
path: file.name,
|
path: file.name,
|
||||||
|
nDatasets: data.dataset.length,
|
||||||
};
|
};
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
Loading…
Reference in a new issue