mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-21 22:16:18 +00:00
generar index
This commit is contained in:
parent
4f3038eee9
commit
41c299a8ce
11 changed files with 518 additions and 18 deletions
BIN
sepa/bun.lockb
BIN
sepa/bun.lockb
Binary file not shown.
|
@ -4,6 +4,7 @@
|
|||
"workspaces": [
|
||||
"sepa-precios-archiver",
|
||||
"sepa-precios-importer",
|
||||
"sepa-index-gen",
|
||||
"ckan"
|
||||
]
|
||||
}
|
||||
|
|
175
sepa/sepa-index-gen/.gitignore
vendored
Normal file
175
sepa/sepa-index-gen/.gitignore
vendored
Normal file
|
@ -0,0 +1,175 @@
|
|||
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
|
||||
|
||||
# Logs
|
||||
|
||||
logs
|
||||
_.log
|
||||
npm-debug.log_
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
lerna-debug.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# Caches
|
||||
|
||||
.cache
|
||||
|
||||
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||
|
||||
report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
|
||||
|
||||
# Runtime data
|
||||
|
||||
pids
|
||||
_.pid
|
||||
_.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
|
||||
coverage
|
||||
*.lcov
|
||||
|
||||
# nyc test coverage
|
||||
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Snowpack dependency directory (https://snowpack.dev/)
|
||||
|
||||
web_modules/
|
||||
|
||||
# TypeScript cache
|
||||
|
||||
*.tsbuildinfo
|
||||
|
||||
# Optional npm cache directory
|
||||
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
|
||||
.eslintcache
|
||||
|
||||
# Optional stylelint cache
|
||||
|
||||
.stylelintcache
|
||||
|
||||
# Microbundle cache
|
||||
|
||||
.rpt2_cache/
|
||||
.rts2_cache_cjs/
|
||||
.rts2_cache_es/
|
||||
.rts2_cache_umd/
|
||||
|
||||
# Optional REPL history
|
||||
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variable files
|
||||
|
||||
.env
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.local
|
||||
|
||||
# parcel-bundler cache (https://parceljs.org/)
|
||||
|
||||
.parcel-cache
|
||||
|
||||
# Next.js build output
|
||||
|
||||
.next
|
||||
out
|
||||
|
||||
# Nuxt.js build / generate output
|
||||
|
||||
.nuxt
|
||||
dist
|
||||
|
||||
# Gatsby files
|
||||
|
||||
# Comment in the public line in if your project uses Gatsby and not Next.js
|
||||
|
||||
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||
|
||||
# public
|
||||
|
||||
# vuepress build output
|
||||
|
||||
.vuepress/dist
|
||||
|
||||
# vuepress v2.x temp and cache directory
|
||||
|
||||
.temp
|
||||
|
||||
# Docusaurus cache and generated files
|
||||
|
||||
.docusaurus
|
||||
|
||||
# Serverless directories
|
||||
|
||||
.serverless/
|
||||
|
||||
# FuseBox cache
|
||||
|
||||
.fusebox/
|
||||
|
||||
# DynamoDB Local files
|
||||
|
||||
.dynamodb/
|
||||
|
||||
# TernJS port file
|
||||
|
||||
.tern-port
|
||||
|
||||
# Stores VSCode versions used for testing VSCode extensions
|
||||
|
||||
.vscode-test
|
||||
|
||||
# yarn v2
|
||||
|
||||
.yarn/cache
|
||||
.yarn/unplugged
|
||||
.yarn/build-state.yml
|
||||
.yarn/install-state.gz
|
||||
.pnp.*
|
||||
|
||||
# IntelliJ based IDEs
|
||||
.idea
|
||||
|
||||
# Finder (MacOS) folder config
|
||||
.DS_Store
|
15
sepa/sepa-index-gen/README.md
Normal file
15
sepa/sepa-index-gen/README.md
Normal file
|
@ -0,0 +1,15 @@
|
|||
# sepa-index-gen
|
||||
|
||||
To install dependencies:
|
||||
|
||||
```bash
|
||||
bun install
|
||||
```
|
||||
|
||||
To run:
|
||||
|
||||
```bash
|
||||
bun run index.ts
|
||||
```
|
||||
|
||||
This project was created using `bun init` in bun v1.1.26. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
|
67
sepa/sepa-index-gen/b2.ts
Normal file
67
sepa/sepa-index-gen/b2.ts
Normal file
|
@ -0,0 +1,67 @@
|
|||
import {
|
||||
S3Client,
|
||||
HeadObjectCommand,
|
||||
ListObjectsV2Command,
|
||||
GetObjectCommand,
|
||||
} from "@aws-sdk/client-s3";
|
||||
import { Upload } from "@aws-sdk/lib-storage";
|
||||
|
||||
function checkEnvVariable(variableName: string) {
|
||||
const value = process.env[variableName];
|
||||
if (value) {
|
||||
console.log(`✅ ${variableName} is set`);
|
||||
return value;
|
||||
} else {
|
||||
console.log(`❌ ${variableName} is not set`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
export const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
|
||||
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
|
||||
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");
|
||||
export const s3 = new S3Client({
|
||||
endpoint: "https://s3.us-west-004.backblazeb2.com",
|
||||
region: "us-west-004",
|
||||
credentials: {
|
||||
accessKeyId: B2_BUCKET_KEY_ID,
|
||||
secretAccessKey: B2_BUCKET_KEY,
|
||||
},
|
||||
});
|
||||
|
||||
export async function listDirectory(directoryName: string) {
|
||||
const command = new ListObjectsV2Command({
|
||||
Bucket: B2_BUCKET_NAME,
|
||||
Prefix: directoryName ? `${directoryName}/` : "",
|
||||
Delimiter: "/",
|
||||
});
|
||||
|
||||
const response = await s3.send(command);
|
||||
return response.Contents?.map((item) => item.Key ?? "") ?? [];
|
||||
}
|
||||
|
||||
export async function getFileContent(fileName: string) {
|
||||
const fileContent = await s3.send(
|
||||
new GetObjectCommand({
|
||||
Bucket: B2_BUCKET_NAME,
|
||||
Key: fileName,
|
||||
})
|
||||
);
|
||||
return (await fileContent.Body?.transformToString()) ?? "";
|
||||
}
|
||||
|
||||
export async function checkFileExists(fileName: string): Promise<boolean> {
|
||||
try {
|
||||
await s3.send(
|
||||
new HeadObjectCommand({
|
||||
Bucket: B2_BUCKET_NAME,
|
||||
Key: fileName,
|
||||
})
|
||||
);
|
||||
return true;
|
||||
} catch (error) {
|
||||
if ((error as any).name === "NotFound") {
|
||||
return false;
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
53
sepa/sepa-index-gen/index-resources.ts
Normal file
53
sepa/sepa-index-gen/index-resources.ts
Normal file
|
@ -0,0 +1,53 @@
|
|||
import { zDatasetInfo, type DatasetInfo, type Resource } from "ckan/schemas";
|
||||
import { getFileContent, listDirectory } from "./b2";
|
||||
|
||||
import { basename } from "path";
|
||||
|
||||
export async function indexResources() {
|
||||
const fileNames = await listDirectory("timestamped-metadata");
|
||||
|
||||
let datasetInfos = new Map<string, Map<string, Resource>>();
|
||||
|
||||
await Promise.all(
|
||||
fileNames.map(async (fileName) => {
|
||||
let resources = new Map<string, Resource>();
|
||||
|
||||
const fileContentText = await getFileContent(fileName);
|
||||
const json = JSON.parse(fileContentText);
|
||||
if (!json.success) {
|
||||
console.log(`skipping ${fileName} because of ${json.error.message}`);
|
||||
return;
|
||||
}
|
||||
let datasetInfo: DatasetInfo;
|
||||
try {
|
||||
datasetInfo = zDatasetInfo.parse(json.result);
|
||||
} catch (e) {
|
||||
console.log(fileName, e);
|
||||
throw e;
|
||||
}
|
||||
|
||||
for (const resource of datasetInfo.resources) {
|
||||
const id = `${resource.id}-${resource.revision_id}`;
|
||||
const existingResource = resources.get(id);
|
||||
if (
|
||||
existingResource &&
|
||||
existingResource.modified &&
|
||||
existingResource.modified > (resource.modified ?? 0)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
resources.set(id, resource);
|
||||
}
|
||||
const base = basename(fileName, ".json");
|
||||
datasetInfos.set(base, resources);
|
||||
})
|
||||
);
|
||||
|
||||
return Array.from(datasetInfos.entries()).reduce(
|
||||
(acc, [fileName, resources]) => {
|
||||
acc[fileName] = Array.from(resources.values());
|
||||
return acc;
|
||||
},
|
||||
{} as Record<string, Resource[]>
|
||||
);
|
||||
}
|
141
sepa/sepa-index-gen/index.ts
Normal file
141
sepa/sepa-index-gen/index.ts
Normal file
|
@ -0,0 +1,141 @@
|
|||
import { zResource, type Resource } from "ckan/schemas";
|
||||
import { z } from "zod";
|
||||
import { listDirectory } from "./b2";
|
||||
import { isSameDay } from "date-fns";
|
||||
import { indexResources } from "./index-resources";
|
||||
|
||||
export async function generateMarkdown() {
|
||||
const resourcesIndex = await indexResources();
|
||||
|
||||
const datasets = z
|
||||
.record(z.string(), z.array(zResource))
|
||||
.parse(resourcesIndex);
|
||||
const datasetsArray = Object.entries(datasets)
|
||||
.map(([date, resources]) => ({
|
||||
date: new Date(date),
|
||||
resources,
|
||||
}))
|
||||
.sort((a, b) => +b.date - +a.date);
|
||||
|
||||
let latestResources = new Map<string, Resource>();
|
||||
|
||||
for (const { date, resources } of datasetsArray) {
|
||||
for (const resource of resources) {
|
||||
const id = `${resource.id}-revID-${resource.revision_id}`;
|
||||
if (latestResources.has(id)) continue;
|
||||
latestResources.set(id, resource);
|
||||
}
|
||||
}
|
||||
|
||||
function getDate(resource: Resource) {
|
||||
{
|
||||
const matches = resource.name.match(/precios_(\d{4})(\d{2})(\d{2})/);
|
||||
if (matches) {
|
||||
return new Date(
|
||||
parseInt(matches[1]),
|
||||
parseInt(matches[2]) - 1,
|
||||
parseInt(matches[3])
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
const matches = resource.description.match(
|
||||
/(?<day>\d{2})\/(?<month>\d{2})\/(?<year>\d{4})|(?<year2>\d{4})-(?<month2>\d{2})-(?<day2>\d{2})/
|
||||
);
|
||||
if (matches) {
|
||||
const { day, month, year, day2, month2, year2 } = matches.groups!;
|
||||
return new Date(
|
||||
parseInt(year || year2),
|
||||
parseInt(month || month2) - 1,
|
||||
parseInt(day || day2)
|
||||
);
|
||||
}
|
||||
}
|
||||
console.debug(resource);
|
||||
throw new Error(`No date found for ${resource.name}`);
|
||||
}
|
||||
|
||||
const fileList = await listDirectory("");
|
||||
|
||||
const zipResources = [...latestResources.values()].filter(
|
||||
(r) => r.format === "ZIP"
|
||||
);
|
||||
const dates = [
|
||||
...new Set(
|
||||
zipResources.map((r) => getDate(r).toISOString().split("T")[0]).sort()
|
||||
),
|
||||
];
|
||||
|
||||
// check if dates are missing in between min and max date
|
||||
const minDate = new Date(
|
||||
Math.min(...[...dates].map((d) => new Date(d).getTime()))
|
||||
);
|
||||
const maxDate = new Date(
|
||||
Math.max(...[...dates].map((d) => new Date(d).getTime()))
|
||||
);
|
||||
for (let d = minDate; d <= maxDate; d.setDate(d.getDate() + 1)) {
|
||||
const dateStr = d.toISOString().split("T")[0];
|
||||
if (!dates.includes(dateStr)) dates.push(dateStr);
|
||||
}
|
||||
dates.sort();
|
||||
|
||||
function getWeekDay(searchIn: string) {
|
||||
if (/domingo/iu.test(searchIn)) return 0;
|
||||
if (/lunes/iu.test(searchIn)) return 1;
|
||||
if (/martes/iu.test(searchIn)) return 2;
|
||||
if (/mi[eé]rcoles/iu.test(searchIn)) return 3;
|
||||
if (/jueves/iu.test(searchIn)) return 4;
|
||||
if (/viernes/iu.test(searchIn)) return 5;
|
||||
if (/s[aá]bado/iu.test(searchIn)) return 6;
|
||||
return null;
|
||||
}
|
||||
function getWeekDayInResource(resource: Resource) {
|
||||
return getWeekDay(resource.description) ?? getWeekDay(resource.name);
|
||||
}
|
||||
|
||||
let markdown = `# index de archivo de datasets de precios SEPA
|
||||
|
||||
esto esta automáticamente generado por sepa-index-gen dentro de preciazo.`;
|
||||
|
||||
const formatter = Intl.DateTimeFormat("es-AR", {
|
||||
year: "numeric",
|
||||
month: "2-digit",
|
||||
day: "2-digit",
|
||||
weekday: "long",
|
||||
});
|
||||
const dateTimeFormatter = Intl.DateTimeFormat("es-AR", {
|
||||
year: "numeric",
|
||||
month: "2-digit",
|
||||
day: "2-digit",
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
});
|
||||
for (const dateStr of dates) {
|
||||
const date = new Date(dateStr);
|
||||
markdown += `\n* ${formatter.format(date)}:`;
|
||||
const resourcesInDate = zipResources.filter((r) =>
|
||||
isSameDay(getDate(r), date)
|
||||
);
|
||||
if (!resourcesInDate.length) {
|
||||
markdown += " ❌ no tengo recursos para esta fecha";
|
||||
}
|
||||
for (const resource of resourcesInDate) {
|
||||
const id = `${resource.id}-revID-${resource.revision_id}`;
|
||||
const fileExists = fileList.find((file) => file.startsWith(id));
|
||||
const link =
|
||||
fileExists ??
|
||||
`https://f004.backblazeb2.com/file/precios-justos-datasets/${fileExists}`;
|
||||
let warnings = "";
|
||||
if (
|
||||
getWeekDayInResource(resource) &&
|
||||
date.getDay() !== getWeekDayInResource(resource)
|
||||
) {
|
||||
warnings +=
|
||||
"⁉️⚠️ dia de semana incorrecto, puede haberse subido incorrectamente ";
|
||||
}
|
||||
markdown += `\n * ${id} ${warnings} ${fileExists ? `[✅ descargar](${link})` : "❌"} (${dateTimeFormatter.format(resource.modified)})`;
|
||||
}
|
||||
}
|
||||
|
||||
return markdown;
|
||||
}
|
19
sepa/sepa-index-gen/package.json
Normal file
19
sepa/sepa-index-gen/package.json
Normal file
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"name": "sepa-index-gen",
|
||||
"module": "index.ts",
|
||||
"type": "module",
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest",
|
||||
"prettier": "^3.3.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.637.0",
|
||||
"@aws-sdk/lib-storage": "^3.637.0",
|
||||
"ckan": "workspace:*",
|
||||
"date-fns": "^3.6.0",
|
||||
"zod": "^3.23.8"
|
||||
}
|
||||
}
|
27
sepa/sepa-index-gen/tsconfig.json
Normal file
27
sepa/sepa-index-gen/tsconfig.json
Normal file
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
// Enable latest features
|
||||
"lib": ["ESNext", "DOM"],
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleDetection": "force",
|
||||
"jsx": "react-jsx",
|
||||
"allowJs": true,
|
||||
|
||||
// Bundler mode
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"verbatimModuleSyntax": true,
|
||||
"noEmit": true,
|
||||
|
||||
// Best practices
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
|
||||
// Some stricter flags (disabled by default)
|
||||
"noUnusedLocals": false,
|
||||
"noUnusedParameters": false,
|
||||
"noPropertyAccessFromIndexSignature": false
|
||||
}
|
||||
}
|
|
@ -5,6 +5,7 @@ import { basename, extname, join } from "path";
|
|||
import { $, write } from "bun";
|
||||
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
|
||||
import { Upload } from "@aws-sdk/lib-storage";
|
||||
import { generateMarkdown } from "sepa-index-gen";
|
||||
|
||||
function checkEnvVariable(variableName: string) {
|
||||
const value = process.env[variableName];
|
||||
|
@ -38,29 +39,24 @@ async function getRawDatasetInfo() {
|
|||
console.error(
|
||||
`❌ Error fetching dataset info`,
|
||||
error,
|
||||
`retrying in 5min...`,
|
||||
`retrying in 5min...`
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, 5 * 60 * 1000));
|
||||
return await getRawDatasetInfo();
|
||||
}
|
||||
}
|
||||
|
||||
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
|
||||
async function saveFileIntoRepo(fileName: string, fileContent: string) {
|
||||
const dir = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");
|
||||
try {
|
||||
await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${dir}`;
|
||||
await writeFile(
|
||||
dir + "/dataset-info.json",
|
||||
JSON.stringify(datasetInfo, null, 2),
|
||||
);
|
||||
await $`cd ${dir} && git add dataset-info.json`;
|
||||
await $`cd ${dir} && git config user.email "git@nulo.in" && git config user.name "github actions"`;
|
||||
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
|
||||
await writeFile(join(dir, fileName), fileContent);
|
||||
await $`cd ${dir} && git add ${fileName}`;
|
||||
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update ${fileName}"`;
|
||||
await $`cd ${dir} && git push origin main`;
|
||||
} finally {
|
||||
await $`rm -rf ${dir}`;
|
||||
}
|
||||
console.log(`✅ Saved dataset info into repo`);
|
||||
console.log(`✅ Saved ${fileName} into repo`);
|
||||
}
|
||||
|
||||
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
|
||||
|
@ -69,7 +65,7 @@ async function checkFileExistsInB2(fileName: string): Promise<boolean> {
|
|||
new HeadObjectCommand({
|
||||
Bucket: B2_BUCKET_NAME,
|
||||
Key: fileName,
|
||||
}),
|
||||
})
|
||||
);
|
||||
return true;
|
||||
} catch (error) {
|
||||
|
@ -82,7 +78,7 @@ async function checkFileExistsInB2(fileName: string): Promise<boolean> {
|
|||
|
||||
async function uploadToB2Bucket(
|
||||
fileName: string,
|
||||
fileContent: ReadableStream | Blob | string,
|
||||
fileContent: ReadableStream | Blob | string
|
||||
) {
|
||||
const upload = new Upload({
|
||||
client: s3,
|
||||
|
@ -98,12 +94,15 @@ async function uploadToB2Bucket(
|
|||
|
||||
const rawDatasetInfo = await getRawDatasetInfo();
|
||||
|
||||
await saveDatasetInfoIntoRepo(rawDatasetInfo);
|
||||
await saveFileIntoRepo(
|
||||
"dataset-info.json",
|
||||
JSON.stringify(rawDatasetInfo, null, 2)
|
||||
);
|
||||
|
||||
let errored = false;
|
||||
|
||||
function checkRes(
|
||||
res: Response,
|
||||
res: Response
|
||||
): res is Response & { body: ReadableStream<Uint8Array> } {
|
||||
if (!res.ok) {
|
||||
console.error(`❌ Error downloading ${res.url}`);
|
||||
|
@ -116,7 +115,7 @@ function checkRes(
|
|||
|
||||
await uploadToB2Bucket(
|
||||
`timestamped-metadata/${new Date().toISOString()}.json`,
|
||||
JSON.stringify(rawDatasetInfo, null, 2),
|
||||
JSON.stringify(rawDatasetInfo, null, 2)
|
||||
);
|
||||
|
||||
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
||||
|
@ -144,7 +143,7 @@ for (const resource of datasetInfo.result.resources) {
|
|||
|
||||
await writeFile(
|
||||
join(dir, "dataset-info.json"),
|
||||
JSON.stringify(rawDatasetInfo, null, 2),
|
||||
JSON.stringify(rawDatasetInfo, null, 2)
|
||||
);
|
||||
|
||||
const compressed =
|
||||
|
@ -163,6 +162,8 @@ for (const resource of datasetInfo.result.resources) {
|
|||
}
|
||||
}
|
||||
|
||||
await saveFileIntoRepo("index.md", await generateMarkdown());
|
||||
|
||||
if (errored) {
|
||||
process.exit(1);
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.637.0",
|
||||
"@aws-sdk/lib-storage": "^3.637.0",
|
||||
"zod": "^3.23.8"
|
||||
"zod": "^3.23.8",
|
||||
"sepa-index-gen": "workspace:*"
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue