mirror of
https://github.com/catdevnull/transicion-desordenada-diablo
synced 2024-11-23 00:16:20 +00:00
descargar de ckan directo inventando un data.json
por si no soporta data.json y guardar cantidad de datasets en metadata
This commit is contained in:
parent
11a64468e5
commit
81de080d22
5 changed files with 243 additions and 62 deletions
|
@ -28,6 +28,7 @@ export const zData = z.object({
|
|||
homepage: z.string().optional(),
|
||||
dataset: z.array(zDataset),
|
||||
});
|
||||
/** @typedef {z.infer<typeof zData>} Data */
|
||||
|
||||
export const zError = z.object({
|
||||
url: z.string().optional(),
|
||||
|
|
152
downloader/ckan_to_datajson.js
Normal file
152
downloader/ckan_to_datajson.js
Normal file
|
@ -0,0 +1,152 @@
|
|||
import { request } from "undici";
|
||||
import z from "zod";
|
||||
import { userAgent } from "./config.js";
|
||||
import { basename } from "path";
|
||||
|
||||
/**
 * Response envelope of the CKAN `package_list` action: a literal
 * `success: true` flag plus an array of strings in `result`.
 */
const zCkanPackageList = z.object({
  success: z.literal(true),
  result: z.string().array(),
});
|
||||
|
||||
/**
 * Requests `url` sending the configured User-Agent and decodes the response
 * body as JSON.
 *
 * @param {string} url
 * @returns the decoded JSON body
 * @throws {Error} when the server answers with a status code outside 2xx
 */
async function getJson(url) {
  const res = await request(url, {
    headers: {
      "User-Agent": userAgent,
    },
  });
  if (res.statusCode < 200 || res.statusCode >= 300) {
    // The body has to be consumed even though it is not used, otherwise
    // undici keeps the underlying connection busy.
    await res.body.dump();
    throw new Error(
      `Request to ${url} failed with HTTP status ${res.statusCode}`
    );
  }
  return await res.body.json();
}
|
||||
|
||||
/**
 * Downloads the list of dataset names of a portal through the
 * `package_list` action and validates the response.
 *
 * @param {string} ckanUrl
 * @returns {Promise<string[]>}
 */
async function getCkanPackageList(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/package_list`;
  const response = await getJson(endpoint);
  const { result } = zCkanPackageList.parse(response);
  return result;
}
|
||||
|
||||
/** Organization object embedded in a package. */
const zCkanOrganization = z.object({
  name: z.string(),
  title: z.string(),
  id: z.string(),
  created: z.string(),
});

/** One resource of a package. */
const zCkanResource = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string(),
  format: z.string(),
  url: z.string(),
});

/** String fields that tags and groups have in common. */
const zCkanLabelled = z.object({
  id: z.string(),
  display_name: z.string(),
  name: z.string(),
});

/** Tag attached to a package. */
const zCkanTag = zCkanLabelled;

/** Group a package belongs to: the common fields plus a description. */
const zCkanGroup = zCkanLabelled.extend({
  description: z.string(),
});

/** Package object; only the fields listed here are validated. */
const zCkanPackage = z.object({
  license_title: z.string(),
  license_id: z.string(),
  license_url: z.string(),
  maintainer: z.string(),
  maintainer_email: z.string(),
  id: z.string(),
  name: z.string(),
  title: z.string(),
  metadata_created: z.string(),
  metadata_modified: z.string(),
  author: z.string(),
  author_email: z.string(),
  resources: zCkanResource.array(),
  tags: zCkanTag.array(),
  groups: zCkanGroup.array(),
  organization: zCkanOrganization,
  url: z.string(),
  notes: z.string(),
});

/** Response envelope of the `package_show` action. */
const zCkanPackageShow = z.object({
  success: z.literal(true),
  result: zCkanPackage,
});
|
||||
|
||||
/**
 * Fetches a single package through the `package_show` action and validates
 * the response.
 *
 * @param {string} ckanUrl
 * @param {string} packageName
 */
async function getCkanPackage(ckanUrl, packageName) {
  // The name travels in the query string, so it has to be escaped.
  const id = encodeURIComponent(packageName);
  const response = await getJson(
    `${ckanUrl}/api/3/action/package_show?id=${id}`
  );
  const { result } = zCkanPackageShow.parse(response);
  return result;
}
|
||||
|
||||
/**
 * Response envelope of the `status_show` action.
 *
 * The field descriptions are kept in Spanish like the rest of the project's
 * runtime strings. The title description previously sat on `site_url`; it is
 * now attached to `site_title`, and `site_url` has its own description.
 */
const zCkanStatusShow = z.object({
  success: z.literal(true),
  result: z.object({
    site_url: z.string().describe("URL del portal. A veces vacio."),
    site_description: z
      .string()
      .describe("Descripción del portal. A veces vacio."),
    site_title: z.string().describe("Titulo del portal. A veces vacio."),
    error_emails_to: z.string(),
  }),
});
|
||||
|
||||
/**
 * Retrieves general information about the portal through the `status_show`
 * action and validates the response.
 *
 * @param {string} ckanUrl
 */
async function getCkanInfo(ckanUrl) {
  const endpoint = `${ckanUrl}/api/3/action/status_show`;
  const response = await getJson(endpoint);
  const { result } = zCkanStatusShow.parse(response);
  return result;
}
|
||||
|
||||
/**
 * Generates a data.json-shaped catalog from a CKAN portal that may not
 * publish an official data.json.
 *
 * The package list and the portal information are requested first; then
 * every listed package is fetched and mapped to a dataset entry.
 *
 * @param {string} ckanUrl base URL of the portal; API paths are appended to it
 */
export async function generateDataJsonFromCkan(ckanUrl) {
  // Both requests are independent of each other, so they run concurrently
  // instead of one after the other.
  const [list, info] = await Promise.all([
    getCkanPackageList(ckanUrl),
    getCkanInfo(ckanUrl),
  ]);
  const packages = await Promise.all(
    list.map((n) => getCkanPackage(ckanUrl, n))
  );
  /** @type {import("common/schema.js").Data & { generatedBy: string }} */
  const data = {
    generatedBy:
      "archivador de datos abiertos (ckan_to_datajson) <https://github.com/catdevnull/transicion-desordenada-diablo>",
    // The portal metadata may come back empty; fall back to the portal URL.
    title: info.site_title || ckanUrl,
    description: info.site_description || "",
    homepage: info.site_url || ckanUrl,
    dataset: packages.map((p) => ({
      title: p.title,
      description: p.notes,
      identifier: p.id,
      publisher: {
        name: p.maintainer,
        mbox: p.maintainer_email,
      },
      landingPage: p.url,
      distribution: p.resources.map((r) => ({
        identifier: r.id,
        title: r.name,
        description: r.description,
        // The file name is taken from the last segment of the resource URL.
        fileName: basename(r.url),
        format: r.format,
        downloadURL: r.url,
      })),
    })),
  };
  return data;
}
|
|
@ -1,52 +1,59 @@
|
|||
export const sitiosPorDefecto = [
|
||||
"https://datos.gob.ar/data.json",
|
||||
"http://datos.energia.gob.ar/data.json",
|
||||
"https://datos.magyp.gob.ar/data.json",
|
||||
"https://datos.acumar.gov.ar/data.json",
|
||||
"https://datasets.datos.mincyt.gob.ar/data.json",
|
||||
"https://datos.arsat.com.ar/data.json",
|
||||
"https://datos.cultura.gob.ar/data.json",
|
||||
"https://datos.mininterior.gob.ar/data.json",
|
||||
"https://datos.produccion.gob.ar/data.json",
|
||||
"https://datos.salud.gob.ar/data.json",
|
||||
"https://datos.transporte.gob.ar/data.json",
|
||||
"https://ckan.ciudaddemendoza.gov.ar/data.json",
|
||||
"https://datos.santafe.gob.ar/data.json",
|
||||
"https://datosabiertos.chaco.gob.ar/data.json",
|
||||
"https://datosabiertos.mercedes.gob.ar/data.json",
|
||||
"http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json",
|
||||
"https://datosabiertos.desarrollosocial.gob.ar/data.json",
|
||||
"http://datos.mindef.gov.ar/data.json",
|
||||
export const targetsPorDefecto = [
|
||||
"datajson+https://datos.gob.ar/data.json",
|
||||
"datajson+http://datos.energia.gob.ar/data.json",
|
||||
"datajson+https://datos.magyp.gob.ar/data.json",
|
||||
"datajson+https://datos.acumar.gov.ar/data.json",
|
||||
"datajson+https://datasets.datos.mincyt.gob.ar/data.json",
|
||||
"datajson+https://datos.arsat.com.ar/data.json",
|
||||
"datajson+https://datos.cultura.gob.ar/data.json",
|
||||
"datajson+https://datos.mininterior.gob.ar/data.json",
|
||||
"datajson+https://datos.produccion.gob.ar/data.json",
|
||||
"datajson+https://datos.salud.gob.ar/data.json",
|
||||
"datajson+https://datos.transporte.gob.ar/data.json",
|
||||
"datajson+https://ckan.ciudaddemendoza.gov.ar/data.json",
|
||||
"datajson+https://datos.santafe.gob.ar/data.json",
|
||||
"datajson+https://datosabiertos.chaco.gob.ar/data.json",
|
||||
"datajson+https://datosabiertos.mercedes.gob.ar/data.json",
|
||||
"datajson+http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json",
|
||||
"datajson+https://datosabiertos.desarrollosocial.gob.ar/data.json",
|
||||
"datajson+http://datos.mindef.gov.ar/data.json",
|
||||
"datajson+http://datos.legislatura.gob.ar/data.json",
|
||||
|
||||
"https://monitoreo.datos.gob.ar/catalog/jgm/data.json",
|
||||
// 'https://datosabiertos.enacom.gob.ar/data.json',
|
||||
"https://monitoreo.datos.gob.ar/catalog/otros/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/aaip/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/shn/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/smn/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/ign/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/justicia/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/seguridad/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json",
|
||||
// "http://andino.siu.edu.ar/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/educacion/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/inti/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
|
||||
"https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public",
|
||||
"https://transparencia.enargas.gob.ar/data.json",
|
||||
"https://infra.datos.gob.ar/catalog/sspm/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/siep/data.json",
|
||||
"https://monitoreo.datos.gob.ar/catalog/exterior/data.json",
|
||||
"http://datos.pami.org.ar/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json",
|
||||
"https://datos.yvera.gob.ar/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/dine/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/obras/data.json",
|
||||
"https://monitoreo.datos.gob.ar/media/catalog/generos/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/jgm/data.json",
|
||||
// "datajson+https://datosabiertos.enacom.gob.ar/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/otros/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/aaip/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/shn/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/smn/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/ign/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/justicia/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/seguridad/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json",
|
||||
// "datajson+http://andino.siu.edu.ar/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/educacion/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/inti/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
|
||||
"datajson+https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public",
|
||||
"datajson+https://transparencia.enargas.gob.ar/data.json",
|
||||
"datajson+https://infra.datos.gob.ar/catalog/sspm/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/siep/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/catalog/exterior/data.json",
|
||||
"datajson+http://datos.pami.org.ar/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json",
|
||||
"datajson+https://datos.yvera.gob.ar/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/dine/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/obras/data.json",
|
||||
"datajson+https://monitoreo.datos.gob.ar/media/catalog/generos/data.json",
|
||||
|
||||
"ckan+http://datos.jus.gob.ar", // justicia nacional
|
||||
"ckan+https://datos.csjn.gov.ar", // corte suprema de justicia nacional
|
||||
"ckan+https://datos.hcdn.gob.ar", // diputados nacional
|
||||
"ckan+https://data.buenosaires.gob.ar", // CABA
|
||||
"ckan+https://datos.tsjbaires.gov.ar", // tribunal superior de justicia CABA
|
||||
];
|
||||
|
||||
// desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar
|
||||
|
|
|
@ -2,7 +2,8 @@ import { mkdir, open, writeFile } from "node:fs/promises";
|
|||
import { Agent, fetch, request, setGlobalDispatcher } from "undici";
|
||||
import { join, normalize } from "node:path";
|
||||
import pLimit from "p-limit";
|
||||
import { sitiosPorDefecto, userAgent } from "./config.js";
|
||||
import { targetsPorDefecto, userAgent } from "./config.js";
|
||||
import { generateDataJsonFromCkan } from "./ckan_to_datajson.js";
|
||||
|
||||
setGlobalDispatcher(
|
||||
new Agent({
|
||||
|
@ -25,26 +26,43 @@ class StatusCodeError extends Error {
|
|||
}
|
||||
}
|
||||
class TooManyRedirectsError extends Error {}
|
||||
let jsonUrls = process.argv.slice(2);
|
||||
if (jsonUrls.length < 1) {
|
||||
jsonUrls = sitiosPorDefecto;
|
||||
let urls = process.argv.slice(2);
|
||||
if (urls.length < 1) {
|
||||
urls = targetsPorDefecto;
|
||||
}
|
||||
for (const url of jsonUrls)
|
||||
downloadFromData(url).catch((error) =>
|
||||
console.error(`${url} FALLÓ CON`, error)
|
||||
/** @typedef {{type: "data.json" | "ckan"; url: string;}} Target */

/**
 * Targets to download, derived from the raw URL arguments. A `ckan+` prefix
 * selects the CKAN strategy; a `datajson+` prefix or no prefix at all means
 * the URL points to a data.json.
 * @type {Target[]}
 */
const targets = urls.map((rawUrl) => {
  if (rawUrl.startsWith("ckan+")) {
    return { type: "ckan", url: rawUrl.slice("ckan+".length) };
  }
  const url = rawUrl.startsWith("datajson+")
    ? rawUrl.slice("datajson+".length)
    : rawUrl;
  return { type: "data.json", url };
});
|
||||
// Every download is started without awaiting it; a failure is logged per target.
// `target` is an object, so interpolating it directly would log
// "[object Object]"; the label uses the same `type+url` form that is written
// to url.txt.
for (const target of targets) {
  downloadFromData(target).catch((error) =>
    console.error(`${target.type}+${target.url} FALLÓ CON`, error)
  );
}
|
||||
|
||||
/**
|
||||
* @param {string} jsonUrl
|
||||
* @param {Target} target
|
||||
*/
|
||||
async function downloadFromData(jsonUrl) {
|
||||
const outputPath = generateOutputPath(jsonUrl);
|
||||
const jsonRes = await fetch(jsonUrl);
|
||||
async function downloadFromData(target) {
|
||||
const outputPath = generateOutputPath(target.url);
|
||||
let json;
|
||||
if (target.type === "ckan") {
|
||||
json = await generateDataJsonFromCkan(target.url);
|
||||
} else if (target.type === "data.json") {
|
||||
const jsonRes = await fetch(target.url);
|
||||
json = await jsonRes.json();
|
||||
}
|
||||
|
||||
// prettier-ignore
|
||||
const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
|
||||
const parsed = /** @type {{ dataset: Dataset[] }} */(json)
|
||||
await mkdir(outputPath, { recursive: true });
|
||||
await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
|
||||
await writeFile(join(outputPath, "url.txt"), jsonUrl);
|
||||
await writeFile(join(outputPath, "url.txt"), `${target.type}+${target.url}`);
|
||||
const errorFile = (
|
||||
await open(join(outputPath, "errors.jsonl"), "w")
|
||||
).createWriteStream();
|
||||
|
|
|
@ -22,12 +22,15 @@ async function generateMetadata(dumpDir) {
|
|||
.map(async (file) => {
|
||||
const path = join(file.path, file.name);
|
||||
const data = await loadDataJson(path);
|
||||
const url = await readFile(join(path, "url.txt"), "utf-8");
|
||||
let url = await readFile(join(path, "url.txt"), "utf-8");
|
||||
if (url.startsWith("datajson+") || url.startsWith("ckan+"))
|
||||
url = url.slice(url.indexOf("+") + 1);
|
||||
return {
|
||||
title: data.title,
|
||||
description: data.description,
|
||||
url,
|
||||
path: file.name,
|
||||
nDatasets: data.dataset.length,
|
||||
};
|
||||
})
|
||||
);
|
||||
|
|
Loading…
Reference in a new issue