From 81de080d22ac5f968529703cbbf20ff33cefb179 Mon Sep 17 00:00:00 2001 From: Nulo Date: Sat, 9 Dec 2023 16:53:49 -0300 Subject: [PATCH] descargar de ckan directo inventando un data.json por si no soporta data.json y guardar cantidad de datasets en metadata --- common/schema.js | 1 + downloader/ckan_to_datajson.js | 152 +++++++++++++++++++++++++++ downloader/config.js | 103 +++++++++--------- downloader/download_json.js | 44 +++++--- downloader/generate_dump_metadata.js | 5 +- 5 files changed, 243 insertions(+), 62 deletions(-) create mode 100644 downloader/ckan_to_datajson.js diff --git a/common/schema.js b/common/schema.js index f399dbb..ed76540 100644 --- a/common/schema.js +++ b/common/schema.js @@ -28,6 +28,7 @@ export const zData = z.object({ homepage: z.string().optional(), dataset: z.array(zDataset), }); +/** @typedef {z.infer<typeof zData>} Data */ export const zError = z.object({ url: z.string().optional(), diff --git a/downloader/ckan_to_datajson.js b/downloader/ckan_to_datajson.js new file mode 100644 index 0000000..6b15f96 --- /dev/null +++ b/downloader/ckan_to_datajson.js @@ -0,0 +1,152 @@ +import { request } from "undici"; +import z from "zod"; +import { userAgent } from "./config.js"; +import { basename } from "path"; + +const zCkanPackageList = z.object({ + success: z.literal(true), + result: z.array(z.string()), +}); + +/** + * @param {string} url + */ +async function getJson(url) { + const res = await request(url, { + headers: { + "User-Agent": userAgent, + }, + }); + const json = await res.body.json(); + return json; +} + +/** + * Downloads the list of dataset names from the CKAN package_list endpoint + * @param {string} ckanUrl + * @returns {Promise<string[]>} + */ +async function getCkanPackageList(ckanUrl) { + const json = await getJson(`${ckanUrl}/api/3/action/package_list`); + return zCkanPackageList.parse(json).result; +} + +const zCkanOrganization = z.object({ + name: z.string(), + title: z.string(), + id: z.string(), + created: z.string(), +}); +const zCkanResource = z.object({ + id: 
z.string(), + name: z.string(), + description: z.string(), + format: z.string(), + url: z.string(), +}); +const zCkanTag = z.object({ + id: z.string(), + display_name: z.string(), + name: z.string(), +}); +const zCkanGroup = z.object({ + id: z.string(), + display_name: z.string(), + name: z.string(), + description: z.string(), +}); +const zCkanPackage = z.object({ + license_title: z.string(), + license_id: z.string(), + license_url: z.string(), + maintainer: z.string(), + maintainer_email: z.string(), + id: z.string(), + name: z.string(), + title: z.string(), + metadata_created: z.string(), + metadata_modified: z.string(), + author: z.string(), + author_email: z.string(), + resources: z.array(zCkanResource), + tags: z.array(zCkanTag), + groups: z.array(zCkanGroup), + organization: zCkanOrganization, + url: z.string(), + notes: z.string(), +}); +const zCkanPackageShow = z.object({ + success: z.literal(true), + result: zCkanPackage, +}); + +/** + * @param {string} ckanUrl + * @param {string} packageName + */ +async function getCkanPackage(ckanUrl, packageName) { + const json = await getJson( + `${ckanUrl}/api/3/action/package_show?id=${encodeURIComponent(packageName)}` + ); + return zCkanPackageShow.parse(json).result; +} + +const zCkanStatusShow = z.object({ + success: z.literal(true), + result: z.object({ + site_url: z.string().describe("Titulo del portal. A veces vacio."), + site_description: z + .string() + .describe("Descripción del portal. A veces vacio."), + site_title: z.string(), + error_emails_to: z.string(), + }), +}); + +/** + * Consigue información general sobre el portal + * @param {string} ckanUrl + */ +async function getCkanInfo(ckanUrl) { + const json = await getJson(`${ckanUrl}/api/3/action/status_show`); + return zCkanStatusShow.parse(json).result; +} + +/** + * Genera un data.json a partir de un CKAN que quizás no tiene un data.json oficial. 
+ * @param {string} ckanUrl + */ +export async function generateDataJsonFromCkan(ckanUrl) { + const list = await getCkanPackageList(ckanUrl); + const info = await getCkanInfo(ckanUrl); + const packages = await Promise.all( + list.map((n) => getCkanPackage(ckanUrl, n)) + ); + /** @type {import("common/schema.js").Data & { generatedBy: string }} */ + const data = { + generatedBy: + "archivador de datos abiertos (ckan_to_datajson) ", + title: info.site_title || ckanUrl, + description: info.site_description || "", + homepage: info.site_url || ckanUrl, + dataset: packages.map((p) => ({ + title: p.title, + description: p.notes, + identifier: p.id, + publisher: { + name: p.maintainer, + mbox: p.maintainer_email, + }, + landingPage: p.url, + distribution: p.resources.map((r) => ({ + identifier: r.id, + title: r.name, + description: r.description, + fileName: basename(r.url), + format: r.format, + downloadURL: r.url, + })), + })), + }; + return data; +} diff --git a/downloader/config.js b/downloader/config.js index e0ae0d2..91988ab 100644 --- a/downloader/config.js +++ b/downloader/config.js @@ -1,52 +1,59 @@ -export const sitiosPorDefecto = [ - "https://datos.gob.ar/data.json", - "http://datos.energia.gob.ar/data.json", - "https://datos.magyp.gob.ar/data.json", - "https://datos.acumar.gov.ar/data.json", - "https://datasets.datos.mincyt.gob.ar/data.json", - "https://datos.arsat.com.ar/data.json", - "https://datos.cultura.gob.ar/data.json", - "https://datos.mininterior.gob.ar/data.json", - "https://datos.produccion.gob.ar/data.json", - "https://datos.salud.gob.ar/data.json", - "https://datos.transporte.gob.ar/data.json", - "https://ckan.ciudaddemendoza.gov.ar/data.json", - "https://datos.santafe.gob.ar/data.json", - "https://datosabiertos.chaco.gob.ar/data.json", - "https://datosabiertos.mercedes.gob.ar/data.json", - "http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json", - "https://datosabiertos.desarrollosocial.gob.ar/data.json", - 
"http://datos.mindef.gov.ar/data.json", +export const targetsPorDefecto = [ + "datajson+https://datos.gob.ar/data.json", + "datajson+http://datos.energia.gob.ar/data.json", + "datajson+https://datos.magyp.gob.ar/data.json", + "datajson+https://datos.acumar.gov.ar/data.json", + "datajson+https://datasets.datos.mincyt.gob.ar/data.json", + "datajson+https://datos.arsat.com.ar/data.json", + "datajson+https://datos.cultura.gob.ar/data.json", + "datajson+https://datos.mininterior.gob.ar/data.json", + "datajson+https://datos.produccion.gob.ar/data.json", + "datajson+https://datos.salud.gob.ar/data.json", + "datajson+https://datos.transporte.gob.ar/data.json", + "datajson+https://ckan.ciudaddemendoza.gov.ar/data.json", + "datajson+https://datos.santafe.gob.ar/data.json", + "datajson+https://datosabiertos.chaco.gob.ar/data.json", + "datajson+https://datosabiertos.mercedes.gob.ar/data.json", + "datajson+http://luj-bue-datos.paisdigital.innovacion.gob.ar/data.json", + "datajson+https://datosabiertos.desarrollosocial.gob.ar/data.json", + "datajson+http://datos.mindef.gov.ar/data.json", + "datajson+http://datos.legislatura.gob.ar/data.json", - "https://monitoreo.datos.gob.ar/catalog/jgm/data.json", - // 'https://datosabiertos.enacom.gob.ar/data.json', - "https://monitoreo.datos.gob.ar/catalog/otros/data.json", - "https://monitoreo.datos.gob.ar/catalog/aaip/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json", - "https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/shn/data.json", - "https://monitoreo.datos.gob.ar/catalog/smn/data.json", - "https://monitoreo.datos.gob.ar/catalog/ign/data.json", - "https://monitoreo.datos.gob.ar/catalog/justicia/data.json", - "https://monitoreo.datos.gob.ar/catalog/seguridad/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json", - // "http://andino.siu.edu.ar/data.json", - "https://monitoreo.datos.gob.ar/catalog/educacion/data.json", - 
"https://monitoreo.datos.gob.ar/media/catalog/inti/data.json", - "https://monitoreo.datos.gob.ar/catalog/ssprys/data.json", - "https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public", - "https://transparencia.enargas.gob.ar/data.json", - "https://infra.datos.gob.ar/catalog/sspm/data.json", - "https://monitoreo.datos.gob.ar/catalog/ssprys/data.json", - "https://monitoreo.datos.gob.ar/catalog/siep/data.json", - "https://monitoreo.datos.gob.ar/catalog/exterior/data.json", - "http://datos.pami.org.ar/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json", - "https://datos.yvera.gob.ar/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/dine/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/obras/data.json", - "https://monitoreo.datos.gob.ar/media/catalog/generos/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/jgm/data.json", + // "datajson+https://datosabiertos.enacom.gob.ar/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/otros/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/aaip/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/sedronar/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/modernizacion/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/shn/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/smn/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/ign/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/justicia/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/seguridad/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/ambiente/data.json", + // "datajson+http://andino.siu.edu.ar/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/educacion/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/inti/data.json", + 
"datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json", + "datajson+https://www.presupuestoabierto.gob.ar/sici/rest-api/catalog/public", + "datajson+https://transparencia.enargas.gob.ar/data.json", + "datajson+https://infra.datos.gob.ar/catalog/sspm/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/ssprys/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/siep/data.json", + "datajson+https://monitoreo.datos.gob.ar/catalog/exterior/data.json", + "datajson+http://datos.pami.org.ar/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/trabajo/data.json", + "datajson+https://datos.yvera.gob.ar/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/renaper/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/dine/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/obras/data.json", + "datajson+https://monitoreo.datos.gob.ar/media/catalog/generos/data.json", + + "ckan+http://datos.jus.gob.ar", // justicia nacional + "ckan+https://datos.csjn.gov.ar", // corte suprema de justicia nacional + "ckan+https://datos.hcdn.gob.ar", // diputados nacional + "ckan+https://data.buenosaires.gob.ar", // CABA + "ckan+https://datos.tsjbaires.gov.ar", // tribunal superior de justicia CABA ]; // desactivado porque va MUY lento: datosabiertos.gualeguaychu.gov.ar diff --git a/downloader/download_json.js b/downloader/download_json.js index 9c24f15..41f46b5 100644 --- a/downloader/download_json.js +++ b/downloader/download_json.js @@ -2,7 +2,8 @@ import { mkdir, open, writeFile } from "node:fs/promises"; import { Agent, fetch, request, setGlobalDispatcher } from "undici"; import { join, normalize } from "node:path"; import pLimit from "p-limit"; -import { sitiosPorDefecto, userAgent } from "./config.js"; +import { targetsPorDefecto, userAgent } from "./config.js"; +import { generateDataJsonFromCkan } from "./ckan_to_datajson.js"; setGlobalDispatcher( new Agent({ @@ -25,26 +26,43 @@ class 
StatusCodeError extends Error { } } class TooManyRedirectsError extends Error {} -let jsonUrls = process.argv.slice(2); -if (jsonUrls.length < 1) { - jsonUrls = sitiosPorDefecto; +let urls = process.argv.slice(2); +if (urls.length < 1) { + urls = targetsPorDefecto; } -for (const url of jsonUrls) - downloadFromData(url).catch((error) => - console.error(`${url} FALLÓ CON`, error) +/** @typedef {{type: "datajson" | "ckan"; url: string;}} Target */ + +/** @type {Target[]} */ +const targets = urls.map((url) => { + if (url.startsWith("datajson+")) { + return { type: "datajson", url: url.slice("datajson+".length) }; + } else if (url.startsWith("ckan+")) { + return { type: "ckan", url: url.slice("ckan+".length) }; + } else return { type: "datajson", url }; +}); +for (const target of targets) + downloadFromData(target).catch((error) => + console.error(`${target.type}+${target.url} FALLÓ CON`, error) ); /** - * @param {string} jsonUrl + * @param {Target} target */ -async function downloadFromData(jsonUrl) { - const outputPath = generateOutputPath(jsonUrl); - const jsonRes = await fetch(jsonUrl); +async function downloadFromData(target) { + const outputPath = generateOutputPath(target.url); + let json; + if (target.type === "ckan") { + json = await generateDataJsonFromCkan(target.url); + } else if (target.type === "datajson") { + const jsonRes = await fetch(target.url); + json = await jsonRes.json(); + } + // prettier-ignore - const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json()) + const parsed = /** @type {{ dataset: Dataset[] }} */(json) await mkdir(outputPath, { recursive: true }); await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed)); - await writeFile(join(outputPath, "url.txt"), jsonUrl); + await writeFile(join(outputPath, "url.txt"), `${target.type}+${target.url}`); const errorFile = ( await open(join(outputPath, "errors.jsonl"), "w") ).createWriteStream(); diff --git a/downloader/generate_dump_metadata.js b/downloader/generate_dump_metadata.js 
index 9bb5e41..46e68ae 100644 --- a/downloader/generate_dump_metadata.js +++ b/downloader/generate_dump_metadata.js @@ -22,12 +22,15 @@ async function generateMetadata(dumpDir) { .map(async (file) => { const path = join(file.path, file.name); const data = await loadDataJson(path); - const url = await readFile(join(path, "url.txt"), "utf-8"); + let url = await readFile(join(path, "url.txt"), "utf-8"); + if (/^(datajson|data\.json|ckan)\+/.test(url)) + url = url.slice(url.indexOf("+") + 1); return { title: data.title, description: data.description, url, path: file.name, + nDatasets: data.dataset.length, }; }) );