compartir código, estandarizar términos, generar dump-metadata

2024-11-26 11:26:18 +00:00 · 2023-12-08 16:05:25 -03:00 · 2023-12-08 16:05:25 -03:00 · 9800b16bbc
commit 9800b16bbc
parent 37d9b0b767
19 changed files with 1997 additions and 101 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,11 @@
+*/node_modules/
+node_modules/
+dataJsons/
+log
+prueba
+datos.gob.ar*
+data/
+data*
+downloader/data
+
+*.zip
--- a/common/package.json
+++ b/common/package.json
@ -0,0 +1,16 @@
+{
+  "name": "common",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "zod": "^3.22.4"
+  }
+}
--- a/frontend/src/lib/schema.ts
+++ b/frontend/src/lib/schema.ts
@ -12,7 +12,7 @@ export const zDistribution = z.object({
  title: z.string(),
  description: z.string().optional(),
 });
-export type Distribution = z.infer<typeof zDistribution>;
+/** @typedef {z.infer<typeof zDistribution>} Distribution */
 export const zDataset = z.object({
  identifier: z.string(),
  title: z.string(),
@ -21,7 +21,7 @@ export const zDataset = z.object({
  distribution: z.array(zDistribution),
  landingPage: z.string().optional(),
 });
-export type Dataset = z.infer<typeof zDataset>;
+/** @typedef {z.infer<typeof zDataset>} Dataset */
 export const zData = z.object({
  title: z.string(),
  description: z.string(),
@ -36,3 +36,15 @@ export const zError = z.object({
  kind: z.enum(["generic_error", "http_error", "infinite_redirect"]),
  error: z.string().optional(),
 });
+
+export const zDumpMetadata = z.object({ // shape of the object written to dump-metadata.json
+  sites: z.array( // one entry per portal folder found in the dump
+    z.object({
+      title: z.string(), // title taken from the portal data.json
+      description: z.string(), // description taken from the portal data.json
+      url: z.string(), // contents of the url.txt stored next to data.json
+      path: z.string(), // name of the portal folder inside the dump
+    })
+  ),
+});
+/** @typedef {z.infer<typeof zDumpMetadata>} DumpMetadata */
--- a/common/tsconfig.json
+++ b/common/tsconfig.json
@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "lib": ["es2023"],
+    "module": "ES2020",
+    "target": "es2022",
+
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "emitDeclarationOnly": true,
+    "declaration": true
+  }
+}
--- a/downloader/Containerfile
+++ b/downloader/Containerfile
@ -1,19 +1,19 @@

 FROM docker.io/alpine:3.18 as build
 RUN apk add --no-cache npm
-RUN npm install -g esbuild
-WORKDIR /tmp/build
+RUN npm install -g esbuild pnpm

-COPY package.json .
-RUN npm install
-
-COPY download_json.js .
-RUN esbuild --bundle --format=cjs --platform=node --outfile=build.js --sourcemap=inline download_json.js
+COPY .. /tmp/build/
+WORKDIR /tmp/build/downloader
+RUN pnpm install
+RUN esbuild --bundle --format=cjs --platform=node --outfile=download_json.build.js --sourcemap=inline download_json.js
+RUN esbuild --bundle --format=cjs --platform=node --outfile=generate_dump_metadata.build.js --sourcemap=inline generate_dump_metadata.js

 FROM docker.io/alpine:3.18
 RUN apk add --no-cache nodejs-current tini
-COPY pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pem
-COPY --from=build /tmp/build/build.js /usr/local/bin/download_json.js
+COPY downloader/pki/ca_intermediate_root_bundle.pem /usr/lib/ca_intermediate_root_bundle.pem
+COPY --from=build /tmp/build/downloader/download_json.build.js /usr/local/bin/download_json.js
+COPY --from=build /tmp/build/downloader/generate_dump_metadata.build.js /usr/local/bin/generate_dump_metadata.js
 ENV NODE_EXTRA_CA_CERTS=/usr/lib/ca_intermediate_root_bundle.pem
 WORKDIR /data
 CMD ["/sbin/tini", "node", "--enable-source-maps", "/usr/local/bin/download_json.js"]
--- a/downloader/download_json.js
+++ b/downloader/download_json.js
@ -4,7 +4,7 @@ import { Agent, fetch, request, setGlobalDispatcher } from "undici";
 import { join, normalize } from "node:path";
 import pLimit from "p-limit";

-const sitiosPorDefecto = [
+export const sitiosPorDefecto = [
  "https://datos.gob.ar/data.json",
  "http://datos.energia.gob.ar/data.json",
  "https://datos.magyp.gob.ar/data.json",
@ -63,7 +63,7 @@ const sitiosPorDefecto = [
 setGlobalDispatcher(
  new Agent({
    pipelining: 0,
-  }),
+  })
 );

 /** key es host
@ -85,29 +85,27 @@ let jsonUrls = process.argv.slice(2);
 if (jsonUrls.length < 1) {
  jsonUrls = sitiosPorDefecto;
 }
-writeFile("readme.txt", generateReadme(jsonUrls));
 for (const url of jsonUrls)
  downloadFromData(url).catch((error) =>
-    console.error(`${url} FALLÓ CON`, error),
+    console.error(`${url} FALLÓ CON`, error)
  );

 /**
- * @param {string} jsonUrlString
+ * @param {string} jsonUrl
 */
-async function downloadFromData(jsonUrlString) {
-  const jsonUrl = new URL(jsonUrlString);
-  const outputPath = `${jsonUrl.host}${jsonUrl.pathname}`.replaceAll("/", "_");
+async function downloadFromData(jsonUrl) {
+  const outputPath = generateOutputPath(jsonUrl);
+  const jsonRes = await fetch(jsonUrl);
+  // prettier-ignore
+  const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
+  await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
+
  await mkdir(outputPath, { recursive: true });
+  await writeFile(join(outputPath, "url.txt"), jsonUrl);
  const errorFile = (
    await open(join(outputPath, "errors.jsonl"), "w")
  ).createWriteStream();
-
  try {
-    const jsonRes = await fetch(jsonUrl);
-    // prettier-ignore
-    const parsed = /** @type {{ dataset: Dataset[] }} */(await jsonRes.json())
-    await writeFile(join(outputPath, "data.json"), JSON.stringify(parsed));
-
    /** @type {DownloadJob[]} */
    const jobs = parsed.dataset.flatMap((dataset) =>
      dataset.distribution
@ -117,7 +115,7 @@ async function downloadFromData(jsonUrlString) {
            return true;
          } catch (error) {
            errorFile.write(
-              JSON.stringify(encodeError({ dataset, dist }, error)) + "\n",
+              JSON.stringify(encodeError({ dataset, dist }, error)) + "\n"
            );
            return false;
          }
@ -128,7 +126,7 @@ async function downloadFromData(jsonUrlString) {
          url: patchUrl(new URL(dist.downloadURL)),
          outputPath,
          attempts: 0,
-        })),
+        }))
    );
    const totalJobs = jobs.length;
    let nFinished = 0;
@ -160,7 +158,7 @@ async function downloadFromData(jsonUrlString) {
    process.stderr.write(`info[${outputPath}]: 0/${totalJobs} done\n`);
    const interval = setInterval(() => {
      process.stderr.write(
-        `info[${outputPath}]: ${nFinished}/${totalJobs} done\n`,
+        `info[${outputPath}]: ${nFinished}/${totalJobs} done\n`
      );
    }, 30000);
    await Promise.all(promises);
@ -172,6 +170,15 @@ async function downloadFromData(jsonUrlString) {
  }
 }

+/** Returns the folder name for a portal: URL host followed by pathname, with every slash replaced by an underscore.
+ * @param {string} jsonUrlString URL of the data.json; it is parsed with the URL constructor
+ */
+export function generateOutputPath(jsonUrlString) {
+  const jsonUrl = new URL(jsonUrlString);
+  const outputPath = `${jsonUrl.host}${jsonUrl.pathname}`.replaceAll("/", "_");
+  return outputPath;
+}
+
 /**
 * @argument {DownloadJob} job
 * @argument {number} attempts
@ -228,12 +235,12 @@ async function downloadDist({ dist, dataset, url, outputPath }) {
  const fileDirPath = join(
    outputPath,
    sanitizeSuffix(dataset.identifier),
-    sanitizeSuffix(dist.identifier),
+    sanitizeSuffix(dist.identifier)
  );
  await mkdir(fileDirPath, { recursive: true });
  const filePath = join(
    fileDirPath,
-    sanitizeSuffix(dist.fileName || dist.identifier),
+    sanitizeSuffix(dist.fileName || dist.identifier)
  );

  if (!res.body) throw new Error("no body");
@ -272,11 +279,11 @@ function sanitizeSuffix(path) {
 */
 function chequearIdsDuplicados(jobs, id) {
  const duplicated = hasDuplicates(
-    jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`),
+    jobs.map((j) => `${j.dataset.identifier}/${j.dist.identifier}`)
  );
  if (duplicated) {
    console.error(
-      `ADVERTENCIA[${id}]: ¡encontré duplicados! es posible que se pisen archivos entre si`,
+      `ADVERTENCIA[${id}]: ¡encontré duplicados! es posible que se pisen archivos entre si`
    );
  }
 }
@ -334,45 +341,3 @@ function shuffleArray(array) {
    [array[i], array[j]] = [array[j], array[i]];
  }
 }
-
-/**
- * @param {string[]} portales
- */
-function generateReadme(portales) {
-  // basado en el readme de Patricio
-  return `Dumps de Portales de Datos Abiertos de la República Argentina
-=============================================================
-
-El zip contiene todo lo que se pudo descargar de los portales seleccionados, que fueron:
-${portales.map((p) => `- ${p}`).join("\n")}
-
-La carpeta está ordenada en subcarpetas cuyo nombre corresponde al ID del dataset/distribución del portal. De esta forma, 
-leyendo el data.json se puede programaticamente y de manera simple volver a mapear qué archivo le corresponde a cada
-distribución.
-
-Formato:
-
- {url de data.json sin protocolo y con / reemplazado por _}/
-  - data.json
-  - errors.jsonl: archivo con todos los errores que se obtuvieron al intentar descargar todo.
-  - {identifier de dataset}/
-    - {identifier de distribution}/
-      - {fileName (o, si no existe, identifier de distribution)}
-
-Ejemplo:
-
- datos.gob.ar_data.json/
-  - data.json
-  - errors.jsonl
-  - turismo_fbc269ea-5f71-45b6-b70c-8eb38a03b8db/
-    - turismo_0774a0bb-71c2-44d7-9ea6-780e6bd06d50/
-      - cruceristas-por-puerto-residencia-desagregado-por-pais-mes.csv
-    - ...
-  - energia_0d4a18ee-9371-439a-8a94-4f53a9822664/
-    - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866/
-      - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866 (este archivo no tiene fileName en el data.json, entonces se reutiliza el identifier)
-  - ...
-
-Este dump fue generado con transicion-desordenada-diablo: https://gitea.nulo.in/Nulo/transicion-desordenada-diablo
-`;
-}
--- a/downloader/generate_dump_metadata.js
+++ b/downloader/generate_dump_metadata.js
@ -0,0 +1,98 @@
+// @ts-check
+import { readFile, readdir, writeFile } from "node:fs/promises";
+import { zData } from "common/schema.js";
+import { join } from "node:path";
+
+const dumpDir = process.argv[2]; // first CLI argument: folder of the dump to describe
+generateMetadata(dumpDir); // not awaited here; a missing argument is handled inside generateMetadata
+
+/**
+ * @param {string} dumpDir
+ */
+async function generateMetadata(dumpDir) {
+  if (!dumpDir) {
+    console.error("Especifica una carpeta para generar los metadatos, porfa.");
+    process.exit(1);
+  }
+
+  const files = await readdir(dumpDir, { withFileTypes: true });
+  const sites = await Promise.all(
+    files
+      .filter((file) => file.isDirectory())
+      .map(async (file) => {
+        const path = join(file.path, file.name);
+        const data = await loadDataJson(path);
+        const url = await readFile(join(path, "url.txt"), "utf-8");
+        return {
+          title: data.title,
+          description: data.description,
+          url,
+          path: file.name,
+        };
+      })
+  );
+  /** @type {import("common/schema.js").DumpMetadata} */
+  const dumpMetadata = { sites };
+  await writeFile(
+    join(dumpDir, "dump-metadata.json"),
+    JSON.stringify(dumpMetadata)
+  );
+  await writeFile(
+    join(dumpDir, "readme.txt"),
+    generateReadme(sites.map((s) => s.url))
+  );
+}
+
+/** Builds the text of readme.txt (in Spanish), listing every given portal as one bullet line.
+ * @param {string[]} portales portal URLs to list
+ */
+function generateReadme(portales) {
+  // based on the readme by Patricio
+  return `Dumps de Portales de Datos Abiertos de la República Argentina
+=============================================================
+
+Esta carpeta contiene todo lo que se pudo descargar de los portales seleccionados, que fueron:
+${portales.map((p) => `- ${p}`).join("\n")}
+
+La carpeta está ordenada en subcarpetas cuyo nombre corresponde al ID del dataset/distribución del portal. De esta forma, 
+leyendo el data.json se puede programaticamente y de manera simple volver a mapear qué archivo le corresponde a cada
+distribución.
+
+Formato:
+
+- {url de data.json sin protocolo y con / reemplazado por _}/
+  - data.json
+  - errors.jsonl: archivo con todos los errores que se obtuvieron al intentar descargar todo.
+  - {identifier de dataset}/
+    - {identifier de distribution}/
+      - {fileName (o, si no existe, identifier de distribution)}
+
+Ejemplo:
+
+- datos.gob.ar_data.json/
+  - data.json
+  - errors.jsonl
+  - turismo_fbc269ea-5f71-45b6-b70c-8eb38a03b8db/
+    - turismo_0774a0bb-71c2-44d7-9ea6-780e6bd06d50/
+      - cruceristas-por-puerto-residencia-desagregado-por-pais-mes.csv
+    - ...
+  - energia_0d4a18ee-9371-439a-8a94-4f53a9822664/
+    - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866/
+      - energia_9f602b6e-2bef-4ac4-895d-f6ecd6bb1866 (este archivo no tiene fileName en el data.json, entonces se reutiliza el identifier)
+  - ...
+
+Este dump fue generado con transicion-desordenada-diablo: https://gitea.nulo.in/Nulo/transicion-desordenada-diablo
+
+Se puede usar el frontend en esa repo para ver el dump.
+`;
+}
+
+/** Reads data.json from dir as UTF-8, parses it as JSON and returns the result of zData.parse on it.
+ * @param {string} dir dump folder
+ */
+async function loadDataJson(dir) {
+  const text = await readFile(join(dir, "data.json"), "utf-8");
+  const json = JSON.parse(text);
+  const data = zData.parse(json);
+  return data;
+}
--- a/downloader/package.json
+++ b/downloader/package.json
@ -1,5 +1,5 @@
 {
-  "name": "js",
+  "name": "downloader",
  "type": "module",
  "version": "1.0.0",
  "description": "",
@ -12,10 +12,11 @@
  "license": "ISC",
  "dependencies": {
    "p-limit": "^5.0.0",
-    "undici": "^5.28.0"
+    "undici": "^5.28.0",
+    "common": "workspace:"
  },
  "devDependencies": {
    "@tsconfig/node20": "^20.1.2",
    "@types/node": "^20.10.0"
  }
-}
+}
--- a/downloader/tsconfig.json
+++ b/downloader/tsconfig.json
@ -13,5 +13,11 @@
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "moduleResolution": "node16"
-  }
+  },
+  "include": [
+    "**/*.ts",
+    "**/*.js",
+    // https://github.com/microsoft/TypeScript/issues/33136#issuecomment-578699134
+    "../common/**/*.js"
+  ]
 }
--- a/frontend/package.json
+++ b/frontend/package.json
@ -23,7 +23,8 @@
    "tslib": "^2.6.2",
    "typescript": "^5.2.2",
    "vite": "^5.0.0",
-    "vite-plugin-svelte-svg": "^2.3.0"
+    "vite-plugin-svelte-svg": "^2.3.0",
+    "common": "workspace:"
  },
  "dependencies": {
    "eva-icons": "^1.1.3",
--- a/frontend/src/App.svelte
+++ b/frontend/src/App.svelte
@ -3,13 +3,13 @@
  import { currentRoute, type ComponentType } from "./lib/router";

  import NotFound from "./lib/routes/NotFound.svelte";
-  import DumpIndex from "./lib/routes/DumpIndex.svelte";
+  import Portal from "./lib/routes/Portal.svelte";
  import Dataset from "./lib/routes/Dataset.svelte";

  function chooseComponent(route: ComponentType) {
    if (route === "NotFound") return NotFound;
    else if (route === "Dataset") return Dataset;
-    else if (route === "DumpIndex") return DumpIndex;
+    else if (route === "Portal") return Portal;
  }

  $: component = chooseComponent($currentRoute.component);
--- a/frontend/src/lib/portal.ts
+++ b/frontend/src/lib/portal.ts
@ -1,5 +1,5 @@
 import streamSaver from "streamsaver";
-import { zData, type Distribution, zError } from "./schema";
+import { zData, type Distribution, zError } from "common/schema";

 export async function downloadFile(
  dataPath: string,
--- a/frontend/src/lib/router.ts
+++ b/frontend/src/lib/router.ts
@ -2,11 +2,11 @@ import navaid, { type Params } from "navaid";
 import { writable } from "svelte/store";

 export const routes = {
-  DumpIndex: "/d/:dumpUrl",
-  Dataset: "/d/:dumpUrl/dataset/:id",
+  Portal: "/portal/:portalUrl",
+  Dataset: "/portal/:portalUrl/dataset/:id",
 };

-export type ComponentType = "NotFound" | "DumpIndex" | "Dataset";
+export type ComponentType = "NotFound" | "Portal" | "Dataset";

 type Route = {
  component: ComponentType;
@ -15,12 +15,12 @@ type Route = {
 export const currentRoute = writable<Route>();

 export const router = navaid(undefined, () =>
-  currentRoute.set({ component: "NotFound" })
+  currentRoute.set({ component: "NotFound" }),
 );
-router.on(routes.DumpIndex, (params) =>
-  currentRoute.set({ component: "DumpIndex", params })
+router.on(routes.Portal, (params) =>
+  currentRoute.set({ component: "Portal", params }),
 );
 router.on(routes.Dataset, (params) =>
-  currentRoute.set({ component: "Dataset", params })
+  currentRoute.set({ component: "Dataset", params }),
 );
 router.listen();
--- a/frontend/src/lib/routes/Dataset.svelte
+++ b/frontend/src/lib/routes/Dataset.svelte
@ -1,13 +1,13 @@
 <script lang="ts">
  import ArrowBack from "eva-icons/outline/svg/arrow-back-outline.svg?component";
  import ExternalLink from "eva-icons/outline/svg/external-link-outline.svg?component";
-  import { downloadFile, fetchData, fetchErrors } from "../dump";
+  import { downloadFile, fetchData, fetchErrors } from "../portal";
  import NotFound from "./NotFound.svelte";
  import { inject } from "regexparam";
  import { routes } from "../router";

-  export let params: { dumpUrl: string; id: string };
-  const url = decodeURIComponent(params.dumpUrl);
+  export let params: { portalUrl: string; id: string };
+  const url = decodeURIComponent(params.portalUrl);

  const data = Promise.all([fetchData(url), fetchErrors(url)]).then(
    ([data, errors]) => ({ data, errors }),
@ -27,7 +27,7 @@
          <small>
            <a
              class="flex text-blue-500 leading-none gap-1 items-center"
-              href={inject(routes.DumpIndex, { dumpUrl: params.dumpUrl })}
+              href={inject(routes.Portal, { portalUrl: params.portalUrl })}
            >
              <ArrowBack fill="currentColor" class="h-[1.25em]" /> Viendo {data.title}
            </a>
@ -70,7 +70,8 @@
                </h3>
                {#if error}
                  <small class="block text-red-700">
-                    No está en este dump porque hubo un error al descargarlo :(
+                    No está en este archivo porque hubo un error al descargarlo
+                    :(
                  </small>
                {/if}
                {#if dist.fileName}
--- a/frontend/src/lib/routes/DumpIndex.svelte
+++ b/frontend/src/lib/routes/DumpIndex.svelte
@ -2,12 +2,12 @@
  import { inject } from "regexparam";
  import ArrowForward from "eva-icons/outline/svg/arrow-forward-outline.svg?component";
  import ExternalLink from "eva-icons/outline/svg/external-link-outline.svg?component";
-  import { fetchData, fetchErrors } from "../dump";
+  import { fetchData, fetchErrors } from "../portal";
  import { routes } from "../router";
-  import type { Dataset } from "../schema";
+  import type { Dataset } from "common/schema";

-  export let params: { dumpUrl: string };
-  const url = decodeURIComponent(params.dumpUrl);
+  export let params: { portalUrl: string };
+  const url = decodeURIComponent(params.portalUrl);

  const data = Promise.all([fetchData(url), fetchErrors(url)]).then(
    ([data, errors]) => ({ data, errors }),
@ -70,7 +70,7 @@
      <ul class="divide-y divide-gray-100">
        {#each filterDatasets(data.dataset, query) as dataset}
          {@const datasetLink = inject(routes.Dataset, {
-            dumpUrl: params.dumpUrl,
+            portalUrl: params.portalUrl,
            id: dataset.identifier,
          })}
          <li>
@ -92,7 +92,7 @@
        {/each}
      </ul>
    {:catch error}
-      Hubo un error intenando cargar este dump. <pre>{error}</pre>
+      Hubo un error intenando cargar este portal archivado. <pre>{error}</pre>
    {/await}
  </div>
 </main>
--- a/frontend/tsconfig.json
+++ b/frontend/tsconfig.json
@ -15,6 +15,12 @@
    "checkJs": true,
    "isolatedModules": true
  },
-  "include": ["src/**/*.ts", "src/**/*.js", "src/**/*.svelte"],
+  "include": [
+    "src/**/*.ts",
+    "src/**/*.js",
+    "src/**/*.svelte",
+    // https://github.com/microsoft/TypeScript/issues/33136#issuecomment-578699134
+    "../common/**/*.js"
+  ],
  "references": [{ "path": "./tsconfig.node.json" }]
 }
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@ -0,0 +1,4 @@
+packages:
+  - "frontend/"
+  - "downloader/"
+  - "common/"
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,17 @@
+# Transición Desordenada (diablo)
+
+Herramientas para descargar masivamente portales de datos abiertos y generar un archivo, que luego se puede ver en una página web.
+
+## [Downloader](./downloader)
+
+El descargador.
+
+## [Frontend](./frontend)
+
+La página web para ver el archivo generado.
+
+## Glosario
+
+- Portal (de datos): algo que tiene un data.json en un formato similar a [DCAT 2](https://www.w3.org/TR/vocab-dcat-2/) (suelen ser portales [CKAN](https://ckan.org/))
+- Archivo (dump): una versión descargada de uno o varios portales de datos
+- Dataset: conjunto de archivos que suelen estar relacionados