mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
archivar de verda
This commit is contained in:
parent
cdac98d251
commit
0a8178d649
4 changed files with 151 additions and 15 deletions
5
.github/workflows/sepa-precios-archiver.yml
vendored
5
.github/workflows/sepa-precios-archiver.yml
vendored
|
@ -11,8 +11,7 @@ jobs:
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- name: Setup Bun
|
- uses: oven-sh/setup-bun@v2
|
||||||
uses: oven-sh/setup-bun@v1
|
|
||||||
with:
|
with:
|
||||||
bun-version: latest
|
bun-version: latest
|
||||||
|
|
||||||
|
@ -21,5 +20,5 @@ jobs:
|
||||||
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
cd sepa-precios-archiver
|
cd sepa-precios-archiver
|
||||||
bun install
|
bun install --frozen-lockfile
|
||||||
bun index.ts
|
bun index.ts
|
||||||
|
|
Binary file not shown.
|
@ -1,11 +1,35 @@
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { zDatasetInfo } from "./schemas";
|
import { zDatasetInfo } from "./schemas";
|
||||||
import { mkdtemp, writeFile } from "fs/promises";
|
import { mkdtemp, writeFile, readdir, mkdir, rm } from "fs/promises";
|
||||||
import { $ } from "bun";
|
import { basename, extname, join } from "path";
|
||||||
|
import { $, write } from "bun";
|
||||||
|
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
|
||||||
|
import { Upload } from "@aws-sdk/lib-storage";
|
||||||
|
|
||||||
const dir = await mkdtemp("/tmp/sepa-precios-archiver-");
|
function checkEnvVariable(variableName: string) {
|
||||||
|
const value = process.env[variableName];
|
||||||
|
if (value) {
|
||||||
|
console.log(`✅ ${variableName} is set`);
|
||||||
|
return value;
|
||||||
|
} else {
|
||||||
|
console.log(`❌ ${variableName} is not set`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
await $`git clone https://catdevnull:${process.env.GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${dir}`;
|
const GITHUB_TOKEN = checkEnvVariable("GITHUB_TOKEN");
|
||||||
|
const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
|
||||||
|
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
|
||||||
|
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");
|
||||||
|
|
||||||
|
const s3 = new S3Client({
|
||||||
|
endpoint: "https://s3.us-west-004.backblazeb2.com",
|
||||||
|
region: "us-west-004",
|
||||||
|
credentials: {
|
||||||
|
accessKeyId: B2_BUCKET_KEY_ID,
|
||||||
|
secretAccessKey: B2_BUCKET_KEY,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
async function getRawDatasetInfo() {
|
async function getRawDatasetInfo() {
|
||||||
const response = await fetch(
|
const response = await fetch(
|
||||||
|
@ -15,14 +39,125 @@ async function getRawDatasetInfo() {
|
||||||
return json;
|
return json;
|
||||||
}
|
}
|
||||||
|
|
||||||
const rawDatasetInfo = await getRawDatasetInfo();
|
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
|
||||||
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
const dir = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");
|
||||||
|
try {
|
||||||
|
await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${dir}`;
|
||||||
|
await writeFile(
|
||||||
|
dir + "/dataset-info.json",
|
||||||
|
JSON.stringify(datasetInfo, null, 2),
|
||||||
|
);
|
||||||
|
await $`cd ${dir} && git add dataset-info.json`;
|
||||||
|
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
|
||||||
|
await $`cd ${dir} && git push origin main`;
|
||||||
|
} finally {
|
||||||
|
await $`rm -rf ${dir}`;
|
||||||
|
}
|
||||||
|
console.log(`✅ Saved dataset info into repo`);
|
||||||
|
}
|
||||||
|
|
||||||
await writeFile(
|
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
|
||||||
dir + "/dataset-info.json",
|
try {
|
||||||
|
await s3.send(
|
||||||
|
new HeadObjectCommand({
|
||||||
|
Bucket: B2_BUCKET_NAME,
|
||||||
|
Key: fileName,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
return true;
|
||||||
|
} catch (error) {
|
||||||
|
if ((error as any).name === "NotFound") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function uploadToB2Bucket(
|
||||||
|
fileName: string,
|
||||||
|
fileContent: ReadableStream | Blob | string,
|
||||||
|
) {
|
||||||
|
const upload = new Upload({
|
||||||
|
client: s3,
|
||||||
|
params: {
|
||||||
|
Bucket: B2_BUCKET_NAME,
|
||||||
|
Key: fileName,
|
||||||
|
Body: fileContent,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
await upload.done();
|
||||||
|
}
|
||||||
|
|
||||||
|
const rawDatasetInfo = await getRawDatasetInfo();
|
||||||
|
|
||||||
|
await saveDatasetInfoIntoRepo(rawDatasetInfo);
|
||||||
|
|
||||||
|
let errored = false;
|
||||||
|
|
||||||
|
function checkRes(
|
||||||
|
res: Response,
|
||||||
|
): res is Response & { body: ReadableStream<Uint8Array> } {
|
||||||
|
if (!res.ok) {
|
||||||
|
console.error(`❌ Error downloading ${res.url}`);
|
||||||
|
errored = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!res.body) throw new Error(`❌ No body in response`);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
await uploadToB2Bucket(
|
||||||
|
`timestamped-metadata/${new Date().toISOString()}.json`,
|
||||||
JSON.stringify(rawDatasetInfo, null, 2),
|
JSON.stringify(rawDatasetInfo, null, 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
await $`cd ${dir} && git add dataset-info.json`;
|
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
||||||
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
|
for (const resource of datasetInfo.result.resources) {
|
||||||
await $`cd ${dir} && git push origin main`;
|
if (extname(resource.url) === ".zip") {
|
||||||
|
const fileName = `${resource.id}-${basename(resource.url)}-repackaged.tar.zst`;
|
||||||
|
if (await checkFileExistsInB2(fileName)) continue;
|
||||||
|
console.log(`⬇️ Downloading, repackaging and uploading ${resource.url}`);
|
||||||
|
const dir = await mkdtemp("/tmp/sepa-precios-archiver-repackage-");
|
||||||
|
console.info(dir);
|
||||||
|
try {
|
||||||
|
const response = await fetch(resource.url);
|
||||||
|
if (!checkRes(response)) continue;
|
||||||
|
// const response = Bun.file(
|
||||||
|
// `/Users/diablo/Downloads/dump precios justos/${basename(resource.url)}`,
|
||||||
|
// );
|
||||||
|
|
||||||
|
const zip = join(dir, "zip");
|
||||||
|
await write(zip, response);
|
||||||
|
await $`unzip ${zip} -d ${dir}`;
|
||||||
|
await rm(zip);
|
||||||
|
|
||||||
|
for (const file of await readdir(dir)) {
|
||||||
|
const path = join(dir, file);
|
||||||
|
if (extname(file) !== ".zip") continue;
|
||||||
|
const extractDir = join(dir, basename(file, ".zip"));
|
||||||
|
await mkdir(extractDir, { recursive: true });
|
||||||
|
await $`cd ${dir} && unzip ${path} -d ${extractDir}`;
|
||||||
|
await rm(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
const compressed =
|
||||||
|
await $`tar -c -C ${dir} ${dir} | zstd -15 --long -T0`.blob();
|
||||||
|
await uploadToB2Bucket(fileName, compressed);
|
||||||
|
} finally {
|
||||||
|
await $`rm -rf ${dir}`;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const fileName = `${resource.id}-${basename(resource.url)}`;
|
||||||
|
if (await checkFileExistsInB2(fileName)) continue;
|
||||||
|
console.log(`⬇️ Downloading and reuploading ${resource.url}`);
|
||||||
|
const response = await fetch(resource.url);
|
||||||
|
if (!checkRes(response)) continue;
|
||||||
|
|
||||||
|
await uploadToB2Bucket(fileName, response.body);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errored) {
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
|
@ -6,9 +6,11 @@
|
||||||
"@types/bun": "latest"
|
"@types/bun": "latest"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"typescript": "^5.0.0"
|
"typescript": "^5.5.4"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@aws-sdk/client-s3": "^3.637.0",
|
||||||
|
"@aws-sdk/lib-storage": "^3.637.0",
|
||||||
"zod": "^3.23.8"
|
"zod": "^3.23.8"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue