2024-11-22 22:26:19 +00:00
10 changed files with 5 additions and 467 deletions
--- a/sepa/bun.lockb
+++ b/sepa/bun.lockb
--- a/sepa/package.json
+++ b/sepa/package.json
@ -5,7 +5,6 @@
    "sepa-precios-archiver",
    "sepa-precios-importer",
    "sepa-index-gen",
-    "sepa-dataset-validator",
    "ckan"
  ]
 }
--- a/sepa/sepa-dataset-validator/.gitignore
+++ b/sepa/sepa-dataset-validator/.gitignore
@ -1,175 +0,0 @@
-# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
-
-# Logs
-
-logs
-_.log
-npm-debug.log_
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-.pnpm-debug.log*
-
-# Caches
-
-.cache
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-
-report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
-
-# Runtime data
-
-pids
-_.pid
-_.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-
-lib-cov
-
-# Coverage directory used by tools like istanbul
-
-coverage
-*.lcov
-
-# nyc test coverage
-
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-
-bower_components
-
-# node-waf configuration
-
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-
-build/Release
-
-# Dependency directories
-
-node_modules/
-jspm_packages/
-
-# Snowpack dependency directory (https://snowpack.dev/)
-
-web_modules/
-
-# TypeScript cache
-
-*.tsbuildinfo
-
-# Optional npm cache directory
-
-.npm
-
-# Optional eslint cache
-
-.eslintcache
-
-# Optional stylelint cache
-
-.stylelintcache
-
-# Microbundle cache
-
-.rpt2_cache/
-.rts2_cache_cjs/
-.rts2_cache_es/
-.rts2_cache_umd/
-
-# Optional REPL history
-
-.node_repl_history
-
-# Output of 'npm pack'
-
-*.tgz
-
-# Yarn Integrity file
-
-.yarn-integrity
-
-# dotenv environment variable files
-
-.env
-.env.development.local
-.env.test.local
-.env.production.local
-.env.local
-
-# parcel-bundler cache (https://parceljs.org/)
-
-.parcel-cache
-
-# Next.js build output
-
-.next
-out
-
-# Nuxt.js build / generate output
-
-.nuxt
-dist
-
-# Gatsby files
-
-# Comment in the public line in if your project uses Gatsby and not Next.js
-
-# https://nextjs.org/blog/next-9-1#public-directory-support
-
-# public
-
-# vuepress build output
-
-.vuepress/dist
-
-# vuepress v2.x temp and cache directory
-
-.temp
-
-# Docusaurus cache and generated files
-
-.docusaurus
-
-# Serverless directories
-
-.serverless/
-
-# FuseBox cache
-
-.fusebox/
-
-# DynamoDB Local files
-
-.dynamodb/
-
-# TernJS port file
-
-.tern-port
-
-# Stores VSCode versions used for testing VSCode extensions
-
-.vscode-test
-
-# yarn v2
-
-.yarn/cache
-.yarn/unplugged
-.yarn/build-state.yml
-.yarn/install-state.gz
-.pnp.*
-
-# IntelliJ based IDEs
-.idea
-
-# Finder (MacOS) folder config
-.DS_Store
--- a/sepa/sepa-dataset-validator/README.md
+++ b/sepa/sepa-dataset-validator/README.md
@ -1,14 +0,0 @@
-# sepa-dataset-validator
-
-un script para validar los datasets de SEPA automaticamente
-
-basado en [la lista de problemas](https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b) que encontramos
-
-para ejecutar, necesitas [Bun](https://bun.sh)
-
-```bash
-bun install
-bun run . [ruta/al/dataset]
-```
-
-podes descargar un dump de [nuestro index](https://github.com/catdevnull/sepa-precios-metadata/blob/main/index.md) para analizar (la descarga pesa mucho menos que los oficiales :). para descomprimir, necesitas tener `zstd` y `tar`. después solo tenes que ejecutar `tar xvf ARCHIVO.tar.zst` y listo.
--- a/sepa/sepa-dataset-validator/consts.ts
+++ b/sepa/sepa-dataset-validator/consts.ts
@ -1,26 +0,0 @@
-export const ISO_PROVINCIAS = [
-  "AR-C",
-  "AR-B",
-  "AR-K",
-  "AR-H",
-  "AR-U",
-  "AR-X",
-  "AR-W",
-  "AR-E",
-  "AR-P",
-  "AR-Y",
-  "AR-L",
-  "AR-F",
-  "AR-M",
-  "AR-N",
-  "AR-Q",
-  "AR-R",
-  "AR-A",
-  "AR-J",
-  "AR-D",
-  "AR-Z",
-  "AR-S",
-  "AR-G",
-  "AR-V",
-  "AR-T",
-];
--- a/sepa/sepa-dataset-validator/index.ts
+++ b/sepa/sepa-dataset-validator/index.ts
@ -1,169 +0,0 @@
-import * as fs from "fs";
-import { join } from "path";
-import jschardet from "jschardet";
-import Papa from "papaparse";
-import { Comerico, ProductoSegúnSpec } from "./schemas";
-import { ISO_PROVINCIAS } from "./consts";
-
-const dir = process.argv[2];
-
-if (!dir) {
-  console.error("Usage: bun index.ts <directory>");
-  process.exit(1);
-}
-
-async function readFiles(dir: string) {
-  const buffers = {
-    "productos.csv": await fs.promises.readFile(join(dir, "productos.csv")),
-    "sucursales.csv": await fs.promises.readFile(join(dir, "sucursales.csv")),
-    "comercio.csv": await fs.promises.readFile(join(dir, "comercio.csv")),
-  };
-
-  let texts: Record<keyof typeof buffers, string> = {
-    "productos.csv": "",
-    "sucursales.csv": "",
-    "comercio.csv": "",
-  };
-
-  let notUtf8 = [];
-  for (const [name, buffer] of Object.entries(buffers)) {
-    const det = jschardet.detect(buffer.subarray(0, 1024 * 1024));
-    if (det.encoding === "ascii") det.encoding = "UTF-8";
-    if (det.encoding !== "UTF-8") {
-      notUtf8.push(name);
-      if (det.encoding === "UTF-16LE") {
-        texts[name as keyof typeof buffers] = buffer.toString("utf-16le");
-      } else throw new Error(`Can't parse encoding ${det.encoding} in ${name}`);
-    } else {
-      texts[name as keyof typeof buffers] = buffer.toString("utf-8");
-    }
-  }
-  if (notUtf8.length > 0) {
-    console.error(`❌ No son UTF-8: ${notUtf8.join(", ")}`);
-  }
-
-  if (texts["productos.csv"].includes("\t")) {
-    console.error(`❌ El archivo productos.csv contiene tabs`);
-  }
-
-  // XXX: cada uno tiene su interpretación de que tildes tiene cada palabra...
-  // vi varias combinaciones de tildes y sin tildes en los archivos de los datasets
-  // estaría bueno chequearlo para verificar que está según spec
-  const regex =
-    /(?:\r?\n *)?\r?\n(?:[ÚU]ltima actualizaci[oó]n): (.+)(?:\r?\n)*$/iu;
-  for (const [name, text] of Object.entries(texts)) {
-    const matches = text.match(regex);
-    if (!matches) {
-      console.error(`❌ [${name}] No pude encontrar la fecha de actualización`);
-    } else {
-      texts[name as keyof typeof buffers] = text.replace(regex, "");
-    }
-  }
-
-  const csvs = {
-    "productos.csv": Papa.parse(texts["productos.csv"], {
-      header: true,
-    }),
-    "sucursales.csv": Papa.parse(texts["sucursales.csv"], {
-      header: true,
-    }),
-    "comercio.csv": Papa.parse(texts["comercio.csv"], {
-      header: true,
-    }),
-  };
-
-  const comercio = Comerico.parse(csvs["comercio.csv"].data[0]);
-  console.log(
-    `  -> CUIT ${comercio.comercio_cuit}: ${comercio.comercio_razon_social}`
-  );
-  if (Object.values(csvs).some((csv) => csv.errors.length > 0)) {
-    console.error(`❌ Hubo errores parseando el CSV`);
-  }
-  return csvs;
-}
-
-type Files = Awaited<ReturnType<typeof readFiles>>;
-
-// si retorna truthy es un error
-const checkers: Record<string, (files: Files) => boolean | string> = {
-  ["[productos.csv] Nombres de columnas incorrectas"](files) {
-    const firstRow = files["productos.csv"].data[0];
-    if (!firstRow) return true;
-    const res = ProductoSegúnSpec.safeParse(firstRow);
-    if (res.error) {
-      for (const [key, value] of Object.entries(res.error.format())) {
-        if (!value) continue;
-        const errors = Array.isArray(value) ? value : value._errors;
-        console.error(`    Error en columna ${key}:`, errors.join(", "));
-      }
-      return true;
-    }
-    return false;
-  },
-  ["Sucursales mencionadas en productos.csv existen en sucursales.csv"](files) {
-    const productos = new Set(
-      files["productos.csv"].data.map((row) => (row as any).id_sucursal)
-    );
-    const sucursales = new Set(
-      files["sucursales.csv"].data.map((row) => (row as any).id_sucursal)
-    );
-    const missing = [...productos].filter((id) => !sucursales.has(id));
-    if (missing.length > 0) {
-      console.error(
-        `    Las sucursales ${missing.join(", ")} no existen en sucursales.csv`
-      );
-    }
-    return missing.length > 0;
-  },
-  ["Hay productos duplicados con el mismo EAN"](files) {
-    const productosEnSucursales = files["productos.csv"].data
-      .filter((row: any) => row.productos_ean == 1)
-      .map(
-        (row: any) => `${row.id_bandera}-${row.id_sucursal}-${row.id_producto}`
-      );
-    const eansUnicos = new Set(productosEnSucursales);
-    if (productosEnSucursales.length !== eansUnicos.size) return true;
-    return false;
-  },
-  ["[sucursales.csv] sucursales_provincia no cumple con ISO 3166-2"](files) {
-    const sucursales = files["sucursales.csv"].data;
-    for (const sucursal of sucursales) {
-      const prov = (sucursal as any).sucursales_provincia;
-      if (!prov) continue;
-      if (!ISO_PROVINCIAS.includes(prov)) {
-        console.error(`    La provincia ${prov} no es válida`);
-        return true;
-      }
-    }
-    return false;
-  },
-};
-
-const content = await fs.promises.readdir(dir);
-
-if (content.find((x) => x.endsWith(".csv"))) {
-  await chequearDataset(dir);
-} else if (content.find((x) => x.startsWith("sepa"))) {
-  for (const subdir of content) {
-    if (!subdir.startsWith("sepa")) continue;
-    console.info(`chequeando ${subdir}...`);
-    await chequearDataset(join(dir, subdir));
-  }
-}
-
-async function chequearDataset(dir: string) {
-  const files = await readFiles(dir);
-
-  for (const [name, checker] of Object.entries(checkers)) {
-    try {
-      const res = checker(files);
-      if (res) {
-        console.error(`❌ ${name} (${res})`);
-      }
-    } catch (error) {
-      console.error(`❌ ${name}:`, error);
-    }
-  }
-}
-
-console.error(`¡Haga patria, arregle su dataset!`);
--- a/sepa/sepa-dataset-validator/package.json
+++ b/sepa/sepa-dataset-validator/package.json
@ -1,16 +0,0 @@
-{
-  "name": "sepa-dataset-validator",
-  "module": "index.ts",
-  "type": "module",
-  "devDependencies": {
-    "@types/bun": "latest"
-  },
-  "peerDependencies": {
-    "typescript": "^5.5.4"
-  },
-  "dependencies": {
-    "jschardet": "^3.1.3",
-    "papaparse": "^5.4.1",
-    "zod": "^3.23.8"
-  }
-}
--- a/sepa/sepa-dataset-validator/schemas.ts
+++ b/sepa/sepa-dataset-validator/schemas.ts
@ -1,33 +0,0 @@
-import { z } from "zod";
-
-export const Comerico = z.object({
-  id_comercio: z.string(),
-  id_bandera: z.string(),
-  comercio_cuit: z.string(),
-  comercio_razon_social: z.string(),
-  comercio_bandera_nombre: z.string(),
-  comercio_bandera_url: z.string(),
-  comercio_ultima_actualizacion: z.string(),
-  comercio_version_sepa: z.string().optional(), // no es opcional pero a veces no lo agregan...
-});
-
-export const ProductoSegúnSpec = z.object({
-  id_comercio: z.coerce.number(),
-  id_bandera: z.coerce.number(),
-  id_sucursal: z.coerce.number(),
-  id_producto: z.coerce.number(),
-  // 0 es ID interna del comercio, 1 es EAN/UPC-A
-  productos_ean: z.union([z.literal("0"), z.literal("1")]),
-  productos_descripcion: z.string(),
-  productos_cantidad_presentacion: z.coerce.number(),
-  productos_unidad_medida_presentacion: z.string(),
-  productos_marca: z.string(),
-  productos_precio_lista: z.coerce.number(),
-  productos_precio_referencia: z.coerce.number(),
-  productos_cantidad_referencia: z.coerce.number(),
-  productos_unidad_medida_referencia: z.string(),
-  productos_precio_unitario_promo1: z.coerce.number().optional(),
-  productos_leyenda_promo1: z.string().optional(),
-  productos_precio_unitario_promo2: z.coerce.number().optional(),
-  productos_leyenda_promo2: z.string().optional(),
-});
--- a/sepa/sepa-dataset-validator/tsconfig.json
+++ b/sepa/sepa-dataset-validator/tsconfig.json
@ -1,27 +0,0 @@
-{
-  "compilerOptions": {
-    // Enable latest features
-    "lib": ["ESNext", "DOM"],
-    "target": "ESNext",
-    "module": "ESNext",
-    "moduleDetection": "force",
-    "jsx": "react-jsx",
-    "allowJs": true,
-
-    // Bundler mode
-    "moduleResolution": "bundler",
-    "allowImportingTsExtensions": true,
-    "verbatimModuleSyntax": true,
-    "noEmit": true,
-
-    // Best practices
-    "strict": true,
-    "skipLibCheck": true,
-    "noFallthroughCasesInSwitch": true,
-
-    // Some stricter flags (disabled by default)
-    "noUnusedLocals": false,
-    "noUnusedParameters": false,
-    "noPropertyAccessFromIndexSignature": false
-  }
-}
--- a/sepa/sepa-index-gen/index.ts
+++ b/sepa/sepa-index-gen/index.ts
@ -17,14 +17,13 @@ export async function generateMarkdown() {
    }))
    .sort((a, b) => +b.date - +a.date);

-  let latestResources = new Map<string, Resource & { firstSeenAt: Date }>();
+  let latestResources = new Map<string, Resource>();

  for (const { date, resources } of datasetsArray) {
    for (const resource of resources) {
      const id = `${resource.id}-revID-${resource.revision_id}`;
-      const existing = latestResources.get(id);
-      if (existing && existing.firstSeenAt < date) continue;
-      latestResources.set(id, { ...resource, firstSeenAt: date });
+      if (latestResources.has(id)) continue;
+      latestResources.set(id, resource);
    }
  }

@ -124,7 +123,7 @@ esto esta automáticamente generado por sepa-index-gen dentro de preciazo.`;
      const id = `${resource.id}-revID-${resource.revision_id}`;
      const fileExists = fileList.find((file) => file.startsWith(id));
      const link =
-        fileExists &&
+        fileExists !== undefined &&
        `https://f004.backblazeb2.com/file/precios-justos-datasets/${fileExists}`;
      let warnings = "";
      if (
@ -134,7 +133,7 @@ esto esta automáticamente generado por sepa-index-gen dentro de preciazo.`;
        warnings +=
          "⁉️⚠️ dia de semana incorrecto, puede haberse subido incorrectamente ";
      }
-      markdown += `\n  * ${id} ${warnings} ${fileExists ? `[✅ descargar](${link})` : "❌"} (primera vez visto: ${dateTimeFormatter.format(resource.firstSeenAt)})`;
+      markdown += `\n  * ${id} ${warnings} ${fileExists ? `[✅ descargar](${link})` : "❌"} (${dateTimeFormatter.format(resource.modified)})`;
    }
  }