From 9800b16bbc2ba982b010ec130c163ce9b2b829e7 Mon Sep 17 00:00:00 2001
From: Nulo
Date: Fri, 8 Dec 2023 16:05:25 -0300
Subject: [PATCH] share code, standardize terms, generate dump-metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .dockerignore                              |   11 +
 common/package.json                        |   16 +
 .../src/lib/schema.ts => common/schema.js  |   16 +-
 common/tsconfig.json                       |   14 +
 downloader/Containerfile                   |   18 +-
 downloader/download_json.js                |   91 +-
 downloader/generate_dump_metadata.js       |   98 +
 downloader/package.json                    |    7 +-
 downloader/tsconfig.json                   |    8 +-
 frontend/package.json                      |    3 +-
 frontend/src/App.svelte                    |    4 +-
 frontend/src/lib/{dump.ts => portal.ts}    |    2 +-
 frontend/src/lib/router.ts                 |   14 +-
 frontend/src/lib/routes/Dataset.svelte     |   11 +-
 .../{DumpIndex.svelte => Portal.svelte}    |   12 +-
 frontend/tsconfig.json                     |    8 +-
 pnpm-lock.yaml                             | 1744 +++++++++++++++++
 pnpm-workspace.yaml                        |    4 +
 readme.md                                  |   17 +
 19 files changed, 1997 insertions(+), 101 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 common/package.json
 rename frontend/src/lib/schema.ts => common/schema.js (72%)
 create mode 100644 common/tsconfig.json
 create mode 100644 downloader/generate_dump_metadata.js
 rename frontend/src/lib/{dump.ts => portal.ts} (96%)
 rename frontend/src/lib/routes/{DumpIndex.svelte => Portal.svelte} (90%)
 create mode 100644 pnpm-lock.yaml
 create mode 100644 pnpm-workspace.yaml
 create mode 100644 readme.md

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..716ad1e
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+*/node_modules/
+node_modules/
+dataJsons/
+log
+prueba
+datos.gob.ar*
+data/
+data*
+downloader/data
+
+*.zip
\ No newline at end of file
diff --git a/common/package.json b/common/package.json
new file mode 100644
index 0000000..dda0606
--- /dev/null
+++ b/common/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "common",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "zod": "^3.22.4"
+  }
+}
diff --git a/frontend/src/lib/schema.ts b/common/schema.js
similarity index 72%
rename from frontend/src/lib/schema.ts
rename to common/schema.js
index 2620080..f399dbb 100644
--- a/frontend/src/lib/schema.ts
+++ b/common/schema.js
@@ -12,7 +12,7 @@ export const zDistribution = z.object({
   title: z.string(),
   description: z.string().optional(),
 });
-export type Distribution = z.infer<typeof zDistribution>;
+/** @typedef {z.infer<typeof zDistribution>} Distribution */
 export const zDataset = z.object({
   identifier: z.string(),
   title: z.string(),
@@ -21,7 +21,7 @@ export const zDataset = z.object({
   distribution: z.array(zDistribution),
   landingPage: z.string().optional(),
 });
-export type Dataset = z.infer<typeof zDataset>;
+/** @typedef {z.infer<typeof zDataset>} Dataset */
 export const zData = z.object({
   title: z.string(),
   description: z.string(),
@@ -36,3 +36,15 @@ export const zError = z.object({
   kind: z.enum(["generic_error", "http_error", "infinite_redirect"]),
   error: z.string().optional(),
 });
+
+export const zDumpMetadata = z.object({
+  sites: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string(),
+      url: z.string(),
+      path: z.string(),
+    })
+  ),
+});
+/** @typedef {z.infer<typeof zDumpMetadata>} DumpMetadata */
diff --git a/common/tsconfig.json b/common/tsconfig.json
new file mode 100644
index 0000000..9bf2a49
--- /dev/null
+++ b/common/tsconfig.json
@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
"lib": ["es2023"], + "module": "ES2020", + "target": "es2022", + + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "emitDeclarationOnly": true, + "declaration": true + } +} diff --git a/downloader/Containerfile b/downloader/Containerfile index 04081de..1cf26ce 100644 --- a/downloader/Containerfile +++ b/downloader/Containerfile @@ -1,19 +1,19 @@ FROM docker.io/alpine:3.18 as build RUN apk add --no-cache npm -RUN npm install -g esbuild -WORKDIR /tmp/build +RUN npm install -g esbuild pnpm -COPY package.json . -RUN npm install - -COPY download_json.js . -RUN esbuild --bundle --format=cjs --platform=node --outfile=build.js --sourcemap=inline download_json.js +COPY .. /tmp/build/ +WORKDIR /tmp/build/downloader +RUN pnpm install +RUN esbuild --bundle --format=cjs --platform=node --outfile=download_json.build.js --sourcemap=inline download_json.js +RUN esbuild --bundle --format=cjs --platform=node --outfile=generate_dump_metadata.build.js --sourcemap=inline generate_dump_metadata.js FROM docker.io/alpine:3.18 RUN apk add --no-cache nodejs-current tini -COPY pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pem -COPY --from=build /tmp/build/build.js /usr/local/bin/download_json.js +COPY downloader/pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pem +COPY --from=build /tmp/build/downloader/download_json.build.js /usr/local/bin/download_json.js +COPY --from=build /tmp/build/downloader/generate_dump_metadata.build.js /usr/local/bin/generate_dump_metadata.js ENV NODE_EXTRA_CA_CERTS=/usr/lib/ca_intermediate_root_bundle.pem WORKDIR /data CMD ["/sbin/tini", "node", "--enable-source-maps", "/usr/local/bin/download_json.js"] diff --git a/downloader/download_json.js b/downloader/download_json.js index 41bb89a..8f6a2f5 100644 --- a/downloader/download_json.js +++ b/downloader/download_json.js @@ -4,7 +4,7 @@ import { Agent, fetch, request, setGlobalDispatcher } from "undici"; import { join, normalize } from "node:path"; import pLimit from "p-limit"; -const sitiosPorDefecto = [ +export const sitiosPorDefecto = [ "https://datos.gob.ar/data.json", "http://datos.energia.gob.ar/data.json", "https://datos.magyp.gob.ar/data.json", @@ -63,7 +63,7 @@ const sitiosPorDefecto = [ setGlobalDispatcher( new Agent({ pipelining: 0, - }), + }) ); /** key es host @@ -85,29 +85,27 @@ let jsonUrls = process.argv.slice(2); if (jsonUrls.length < 1) { jsonUrls = sitiosPorDefecto; } -writeFile("readme.txt", generateReadme(jsonUrls)); for (const url of jsonUrls) downloadFromData(url).catch((error) => - console.error(`${url} FALLÓ CON`, error), + console.error(`${url} FALLÓ CON`, error) ); /** - * @param {string} jsonUrlString + * @param {string} jsonUrl */ -async function downloadFromData(jsonUrlString) { - const jsonUrl = new URL(jsonUrlString); - const outputPath = `${jsonUrl.host}${jsonUrl.pathname}`.replaceAll("/", "_"); +async function downloadFromData(jsonUrl) { + const outputPath = generateOutputPath(jsonUrl); + const jsonRes = await fetch(jsonUrl); + // prettier-ignore + const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json()) + await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed)); + await mkdir(outputPath, { recursive: true }); + await writeFile(join(outputPath, "url.txt"), jsonUrl); const errorFile = ( await open(join(outputPath, "errors.jsonl"), "w") ).createWriteStream(); - try { - const jsonRes = await fetch(jsonUrl); - // prettier-ignore - const parsed = /** @type {{ 
-    await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
-
   /** @type {DownloadJob[]} */
   const jobs = parsed.dataset.flatMap((dataset) =>
     dataset.distribution
@@ -117,7 +115,7 @@ async function downloadFromData(jsonUrlString) {
           return true;
         } catch (error) {
           errorFile.write(
-            JSON.stringify(encodeError({ dataset, dist }, error)) + "\n",
+            JSON.stringify(encodeError({ dataset, dist }, error)) + "\n"
           );
           return false;
         }
@@ -128,7 +126,7 @@ async function downloadFromData(jsonUrlString) {
         url: patchUrl(new URL(dist.downloadURL)),
         outputPath,
         attempts: 0,
-      })),
+      }))
   );
   const totalJobs = jobs.length;
   let nFinished = 0;
@@ -160,7 +158,7 @@ async function downloadFromData(jsonUrlString) {
   process.stderr.write(`info[${outputPath}]: 0/${totalJobs} done\n`);
   const interval = setInterval(() => {
     process.stderr.write(
-      `info[${outputPath}]: ${nFinished}/${totalJobs} done\n`,
+      `info[${outputPath}]: ${nFinished}/${totalJobs} done\n`
     );
   }, 30000);
   await Promise.all(promises);
@@ -172,6 +170,15 @@ async function downloadFromData(jsonUrlString) {
   }
 }
 
+/**
+ * @param {string} jsonUrlString
+ */
+export function generateOutputPath(jsonUrlString) {
+  const jsonUrl = new URL(jsonUrlString);
+  const outputPath = `${jsonUrl.host}${jsonUrl.pathname}`.replaceAll("/", "_");
+  return outputPath;
+}
+
 /**
  * @argument {DownloadJob} job
  * @argument {number} attempts
@@ -228,12 +235,12 @@ async function downloadDist({ dist, dataset, url, outputPath }) {
   const fileDirPath = join(
     outputPath,
     sanitizeSuffix(dataset.identifier),
-    sanitizeSuffix(dist.identifier),
+    sanitizeSuffix(dist.identifier)
   );
   await mkdir(fileDirPath, { recursive: true });
   const filePath = join(
     fileDirPath,
-    sanitizeSuffix(dist.fileName || dist.identifier),
+    sanitizeSuffix(dist.fileName || dist.identifier)
   );
 
   if (!res.body) throw new Error("no body");
@@ -272,11 +279,11 @@ function sanitizeSuffix(path) {
  */
 function chequearIdsDuplicados(jobs, id) {
   const duplicated = hasDuplicates(
-    jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`),
+    jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`)
   );
   if (duplicated) {
     console.error(
-      `ADVERTENCIA[${id}]: ¡encontré duplicados! es posible que se pisen archivos entre si`,
+      `ADVERTENCIA[${id}]: ¡encontré duplicados! es posible que se pisen archivos entre si`
     );
   }
 }
@@ -334,45 +341,3 @@ function shuffleArray(array) {
     [array[i], array[j]] = [array[j], array[i]];
   }
 }
-
-/**
- * @param {string[]} portales
- */
-function generateReadme(portales) {
-  // basado en el readme de Patricio
-  return `Dumps de Portales de Datos Abiertos de la República Argentina
-=============================================================
-
-El zip contiene todo lo que se pudo descargar de los portales seleccionados, que fueron:
-${portales.map((p) => `- ${p}`).join("\n")}
-
-La carpeta está ordenada en subcarpetas cuyo nombre corresponde al ID del dataset/distribución del portal. De esta forma,
-leyendo el data.json se puede programaticamente y de manera simple volver a mapear qué archivo le corresponde a cada
-distribución.
-
-Formato:
-
-- {url de data.json sin protocolo y con / reemplazado por _}/
-  - data.json
-  - errors.jsonl: archivo con todos los errores que se obtuvieron al intentar descargar todo.
-  - {identifier de dataset}/
-    - {identifier de distribution}/
-      - {fileName (o, si no existe, identifier de distribution)}
-
-Ejemplo:
-
-- datos.gob.ar_data.json/
-  - data.json
-  - errors.jsonl
-  - turismo_fbc269ea-5f71-45b6-b70c-8eb38a03b8db/
-    - turismo_0774a0bb-71c2-44d7-9ea6-780e6bd06d50/
-      - cruceristas-por-puerto-residencia-desagregado-por-pais-mes.csv
-  - ...
-  - energia_0d4a18ee-9371-439a-8a94-4f53a9822664/
-    - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866/
-      - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866 (este archivo no tiene fileName en el data.json, entonces se reutiliza el identifier)
-  - ...
-
-Este dump fue generado con transicion-desordenada-diablo: https://gitea.nulo.in/Nulo/transicion-desordenada-diablo
-`;
-}
diff --git a/downloader/generate_dump_metadata.js b/downloader/generate_dump_metadata.js
new file mode 100644
index 0000000..9bb5e41
--- /dev/null
+++ b/downloader/generate_dump_metadata.js
@@ -0,0 +1,98 @@
+// @ts-check
+import { readFile, readdir, writeFile } from "node:fs/promises";
+import { zData } from "common/schema.js";
+import { join } from "node:path";
+
+const dumpDir = process.argv[2];
+generateMetadata(dumpDir);
+
+/**
+ * @param {string} dumpDir
+ */
+async function generateMetadata(dumpDir) {
+  if (!dumpDir) {
+    console.error("Especifica una carpeta para generar los metadatos, porfa.");
+    process.exit(1);
+  }
+
+  const files = await readdir(dumpDir, { withFileTypes: true });
+  const sites = await Promise.all(
+    files
+      .filter((file) => file.isDirectory())
+      .map(async (file) => {
+        const path = join(file.path, file.name);
+        const data = await loadDataJson(path);
+        const url = await readFile(join(path, "url.txt"), "utf-8");
+        return {
+          title: data.title,
+          description: data.description,
+          url,
+          path: file.name,
+        };
+      })
+  );
+  /** @type {import("common/schema.js").DumpMetadata} */
+  const dumpMetadata = { sites };
+  await writeFile(
+    join(dumpDir, "dump-metadata.json"),
+    JSON.stringify(dumpMetadata)
+  );
+  await writeFile(
+    join(dumpDir, "readme.txt"),
+    generateReadme(sites.map((s) => s.url))
+  );
+}
+
+/**
+ * @param {string[]} portales
+ */
+function generateReadme(portales) {
+  // basado en el readme de Patricio
+  return `Dumps de Portales de Datos Abiertos de la República Argentina
+=============================================================
+
+Esta carpeta contiene todo lo que se pudo descargar de los portales seleccionados, que fueron:
+${portales.map((p) => `- ${p}`).join("\n")}
+
+La carpeta está ordenada en subcarpetas cuyo nombre corresponde al ID del dataset/distribución del portal. De esta forma,
+leyendo el data.json se puede programaticamente y de manera simple volver a mapear qué archivo le corresponde a cada
+distribución.
+
+Formato:
+
+- {url de data.json sin protocolo y con / reemplazado por _}/
+  - data.json
+  - errors.jsonl: archivo con todos los errores que se obtuvieron al intentar descargar todo.
+  - {identifier de dataset}/
+    - {identifier de distribution}/
+      - {fileName (o, si no existe, identifier de distribution)}
+
+Ejemplo:
+
+- datos.gob.ar_data.json/
+  - data.json
+  - errors.jsonl
+  - turismo_fbc269ea-5f71-45b6-b70c-8eb38a03b8db/
+    - turismo_0774a0bb-71c2-44d7-9ea6-780e6bd06d50/
+      - cruceristas-por-puerto-residencia-desagregado-por-pais-mes.csv
+  - ...
+  - energia_0d4a18ee-9371-439a-8a94-4f53a9822664/
+    - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866/
+      - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866 (este archivo no tiene fileName en el data.json, entonces se reutiliza el identifier)
+  - ...
+
+Este dump fue generado con transicion-desordenada-diablo: https://gitea.nulo.in/Nulo/transicion-desordenada-diablo
+
+Se puede usar el frontend en esa repo para ver el dump.
+`;
+}
+
+/**
+ * @param {string} dir carpeta del dump
+ */
+async function loadDataJson(dir) {
+  const text = await readFile(join(dir, "data.json"), "utf-8");
+  const json = JSON.parse(text);
+  const data = zData.parse(json);
+  return data;
+}
diff --git a/downloader/package.json b/downloader/package.json
index 6a14082..708e031 100644
--- a/downloader/package.json
+++ b/downloader/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "js",
+  "name": "downloader",
   "type": "module",
   "version": "1.0.0",
   "description": "",
@@ -12,10 +12,11 @@
   "license": "ISC",
   "dependencies": {
     "p-limit": "^5.0.0",
-    "undici": "^5.28.0"
+    "undici": "^5.28.0",
+    "common": "workspace:"
   },
   "devDependencies": {
     "@tsconfig/node20": "^20.1.2",
     "@types/node": "^20.10.0"
   }
-}
\ No newline at end of file
+}
diff --git a/downloader/tsconfig.json b/downloader/tsconfig.json
index 389d708..d0ba92f 100644
--- a/downloader/tsconfig.json
+++ b/downloader/tsconfig.json
@@ -13,5 +13,11 @@
     "skipLibCheck": true,
     "forceConsistentCasingInFileNames": true,
     "moduleResolution": "node16"
-  }
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.js",
+    // https://github.com/microsoft/TypeScript/issues/33136#issuecomment-578699134
+    "../common/**/*.js"
+  ]
 }
diff --git a/frontend/package.json b/frontend/package.json
index 566b678..980abd5 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -23,7 +23,8 @@
     "tslib": "^2.6.2",
     "typescript": "^5.2.2",
     "vite": "^5.0.0",
-    "vite-plugin-svelte-svg": "^2.3.0"
+    "vite-plugin-svelte-svg": "^2.3.0",
+    "common": "workspace:"
   },
   "dependencies": {
     "eva-icons": "^1.1.3",
diff --git a/frontend/src/App.svelte b/frontend/src/App.svelte
index af033d8..326b779 100644
--- a/frontend/src/App.svelte
+++ b/frontend/src/App.svelte
@@ -3,13 +3,13 @@
   import { currentRoute, type ComponentType } from "./lib/router";
 
   import NotFound from "./lib/routes/NotFound.svelte";
-  import DumpIndex from "./lib/routes/DumpIndex.svelte";
+  import Portal from "./lib/routes/Portal.svelte";
   import Dataset from "./lib/routes/Dataset.svelte";
 
   function chooseComponent(route: ComponentType) {
     if (route === "NotFound") return NotFound;
     else if (route === "Dataset") return Dataset;
-    else if (route === "DumpIndex") return DumpIndex;
+    else if (route === "Portal") return Portal;
   }
 
   $: component = chooseComponent($currentRoute.component);
diff --git a/frontend/src/lib/dump.ts b/frontend/src/lib/portal.ts
similarity index 96%
rename from frontend/src/lib/dump.ts
rename to frontend/src/lib/portal.ts
index b2c4e34..5da5855 100644
--- a/frontend/src/lib/dump.ts
+++ b/frontend/src/lib/portal.ts
@@ -1,5 +1,5 @@
 import streamSaver from "streamsaver";
-import { zData, type Distribution, zError } from "./schema";
+import { zData, type Distribution, zError } from "common/schema";
 
 export async function downloadFile(
   dataPath: string,
diff --git a/frontend/src/lib/router.ts b/frontend/src/lib/router.ts
index 4df7073..20ddda4 100644
--- a/frontend/src/lib/router.ts
+++ b/frontend/src/lib/router.ts
@@ -2,11 +2,11 @@ import navaid, { type Params } from "navaid";
 import { writable } from "svelte/store";
 
 export const routes = {
-  DumpIndex: "/d/:dumpUrl",
-  Dataset: "/d/:dumpUrl/dataset/:id",
+  Portal: "/portal/:portalUrl",
+  Dataset: "/portal/:portalUrl/dataset/:id",
 };
 
-export type ComponentType = "NotFound" | "DumpIndex" | "Dataset";
+export type ComponentType = "NotFound" | "Portal" | "Dataset";
"Dataset"; type Route = { component: ComponentType; @@ -15,12 +15,12 @@ type Route = { export const currentRoute = writable(); export const router = navaid(undefined, () => - currentRoute.set({ component: "NotFound" }) + currentRoute.set({ component: "NotFound" }), ); -router.on(routes.DumpIndex, (params) => - currentRoute.set({ component: "DumpIndex", params }) +router.on(routes.Portal, (params) => + currentRoute.set({ component: "Portal", params }), ); router.on(routes.Dataset, (params) => - currentRoute.set({ component: "Dataset", params }) + currentRoute.set({ component: "Dataset", params }), ); router.listen(); diff --git a/frontend/src/lib/routes/Dataset.svelte b/frontend/src/lib/routes/Dataset.svelte index aa61c38..9133cd8 100644 --- a/frontend/src/lib/routes/Dataset.svelte +++ b/frontend/src/lib/routes/Dataset.svelte @@ -1,13 +1,13 @@