mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
archivar de verda
This commit is contained in:
parent
cdac98d251
commit
0a8178d649
4 changed files with 151 additions and 15 deletions
5
.github/workflows/sepa-precios-archiver.yml
vendored
5
.github/workflows/sepa-precios-archiver.yml
vendored
|
@ -11,8 +11,7 @@ jobs:
|
|||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v1
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
with:
|
||||
bun-version: latest
|
||||
|
||||
|
@ -21,5 +20,5 @@ jobs:
|
|||
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
|
||||
run: |
|
||||
cd sepa-precios-archiver
|
||||
bun install
|
||||
bun install --frozen-lockfile
|
||||
bun index.ts
|
||||
|
|
Binary file not shown.
|
@ -1,11 +1,35 @@
|
|||
import { z } from "zod";
|
||||
import { zDatasetInfo } from "./schemas";
|
||||
import { mkdtemp, writeFile } from "fs/promises";
|
||||
import { $ } from "bun";
|
||||
import { mkdtemp, writeFile, readdir, mkdir, rm } from "fs/promises";
|
||||
import { basename, extname, join } from "path";
|
||||
import { $, write } from "bun";
|
||||
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
|
||||
import { Upload } from "@aws-sdk/lib-storage";
|
||||
|
||||
const dir = await mkdtemp("/tmp/sepa-precios-archiver-");
|
||||
/**
 * Reads a required environment variable and reports whether it is present.
 *
 * An unset or empty variable is treated as missing: a message is logged and
 * the process exits with status 1.
 *
 * @param variableName Name of the environment variable to look up.
 * @returns The variable's value when it is set to a non-empty string.
 */
function checkEnvVariable(variableName: string) {
  const value = process.env[variableName];

  // Guard clause: bail out of the whole process when the variable is missing.
  if (!value) {
    console.log(`❌ ${variableName} is not set`);
    process.exit(1);
  }

  console.log(`✅ ${variableName} is set`);
  return value;
}
|
||||
|
||||
await $`git clone https://catdevnull:${process.env.GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${dir}`;
|
||||
// Required configuration. Each lookup exits the process when its variable is
// missing, so everything below can rely on these being non-empty strings.
const GITHUB_TOKEN = checkEnvVariable("GITHUB_TOKEN");
const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");

// The bucket is reached through Backblaze B2's S3-compatible endpoint; the
// region name is also part of the endpoint host.
const B2_REGION = "us-west-004";
const b2Credentials = {
  accessKeyId: B2_BUCKET_KEY_ID,
  secretAccessKey: B2_BUCKET_KEY,
};

const s3 = new S3Client({
  endpoint: `https://s3.${B2_REGION}.backblazeb2.com`,
  region: B2_REGION,
  credentials: b2Credentials,
});
|
||||
|
||||
async function getRawDatasetInfo() {
|
||||
const response = await fetch(
|
||||
|
@ -15,14 +39,125 @@ async function getRawDatasetInfo() {
|
|||
return json;
|
||||
}
|
||||
|
||||
const rawDatasetInfo = await getRawDatasetInfo();
|
||||
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
||||
/**
 * Publishes the dataset info as `dataset-info.json` in the metadata git
 * repository.
 *
 * The repository is cloned into a fresh temporary directory, the file is
 * rewritten and staged, a commit is created only when the staged content
 * differs, and the result is pushed to `origin main`. The temporary clone is
 * removed whether or not any step fails.
 *
 * @param datasetInfo Value serialized as pretty-printed JSON into the file.
 */
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
  const repoDir = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");

  try {
    await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${repoDir}`;

    const serialized = JSON.stringify(datasetInfo, null, 2);
    const target = `${repoDir}/dataset-info.json`;
    await writeFile(target, serialized);

    await $`cd ${repoDir} && git add dataset-info.json`;
    // `git diff --staged --quiet` exits non-zero when something is staged, so
    // the commit only runs when the file actually changed.
    await $`cd ${repoDir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
    await $`cd ${repoDir} && git push origin main`;
  } finally {
    await $`rm -rf ${repoDir}`;
  }

  console.log(`✅ Saved dataset info into repo`);
}
|
||||
|
||||
await writeFile(
|
||||
dir + "/dataset-info.json",
|
||||
/**
 * Checks whether an object with the given key already exists in the bucket.
 *
 * Issues a HEAD request for the object. An error whose `name` is `"NotFound"`
 * means the object is absent; any other failure is rethrown unchanged.
 *
 * @param fileName Object key to look up in `B2_BUCKET_NAME`.
 * @returns `true` when the HEAD request succeeds, `false` when it reports
 *   `NotFound`.
 * @throws Whatever the S3 client rejected with, for any error other than
 *   `NotFound`.
 */
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
  try {
    await s3.send(
      new HeadObjectCommand({
        Bucket: B2_BUCKET_NAME,
        Key: fileName,
      }),
    );
    return true;
  } catch (error: unknown) {
    // Narrow instead of casting to `any`: a non-object rejection (null,
    // undefined, a string) can no longer blow up on the property access and
    // is rethrown as-is.
    const isNotFound =
      typeof error === "object" &&
      error !== null &&
      "name" in error &&
      error.name === "NotFound";
    if (isNotFound) {
      return false;
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Uploads content to the bucket under the given key and waits for the upload
 * to finish.
 *
 * @param fileName Object key to store the content under.
 * @param fileContent Body to upload: a stream, a blob or a string.
 */
async function uploadToB2Bucket(
  fileName: string,
  fileContent: ReadableStream | Blob | string,
) {
  const params = {
    Bucket: B2_BUCKET_NAME,
    Key: fileName,
    Body: fileContent,
  };

  await new Upload({ client: s3, params }).done();
}
|
||||
|
||||
const rawDatasetInfo = await getRawDatasetInfo();
|
||||
|
||||
await saveDatasetInfoIntoRepo(rawDatasetInfo);
|
||||
|
||||
let errored = false;
|
||||
|
||||
/**
 * Validates a download response and narrows it to one that has a body.
 *
 * A non-OK response is logged, recorded in the module-level `errored` flag and
 * reported as `false` so the caller can skip the resource.
 *
 * @param res Response to validate.
 * @returns `true` when the response is OK and has a body, `false` when the
 *   response is not OK.
 * @throws Error when the response is OK but carries no body.
 */
function checkRes(
  res: Response,
): res is Response & { body: ReadableStream<Uint8Array> } {
  if (res.ok) {
    if (res.body) return true;
    throw new Error(`❌ No body in response`);
  }

  console.error(`❌ Error downloading ${res.url}`);
  errored = true;
  return false;
}
|
||||
|
||||
// Keep a snapshot of the raw dataset info in the bucket, keyed by the current
// time in ISO format.
const snapshotKey = `timestamped-metadata/${new Date().toISOString()}.json`;
await uploadToB2Bucket(snapshotKey, JSON.stringify(rawDatasetInfo, null, 2));
|
||||
|
||||
await $`cd ${dir} && git add dataset-info.json`;
|
||||
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
|
||||
await $`cd ${dir} && git push origin main`;
|
||||
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
||||
// Archive every resource of the dataset into the bucket, skipping those whose
// target key already exists.
for (const resource of datasetInfo.result.resources) {
  const isZip = extname(resource.url) === ".zip";

  if (!isZip) {
    // Non-zip resources are streamed into the bucket unchanged.
    const fileName = `${resource.id}-${basename(resource.url)}`;
    if (await checkFileExistsInB2(fileName)) continue;
    console.log(`⬇️ Downloading and reuploading ${resource.url}`);

    const response = await fetch(resource.url);
    if (!checkRes(response)) continue;

    await uploadToB2Bucket(fileName, response.body);
    continue;
  }

  // Zip resources are unpacked (including zips found at the top level of the
  // archive) and repackaged as a zstd-compressed tarball.
  const fileName = `${resource.id}-${basename(resource.url)}-repackaged.tar.zst`;
  if (await checkFileExistsInB2(fileName)) continue;
  console.log(`⬇️ Downloading, repackaging and uploading ${resource.url}`);

  const workDir = await mkdtemp("/tmp/sepa-precios-archiver-repackage-");
  console.info(workDir);
  try {
    const response = await fetch(resource.url);
    // `continue` still runs the `finally` below, so the work dir is removed.
    if (!checkRes(response)) continue;

    const zipPath = join(workDir, "zip");
    await write(zipPath, response);
    await $`unzip ${zipPath} -d ${workDir}`;
    await rm(zipPath);

    // Expand each nested zip into a sibling directory named after it, then
    // drop the nested zip itself.
    for (const entry of await readdir(workDir)) {
      if (extname(entry) !== ".zip") continue;
      const nestedZip = join(workDir, entry);
      const extractDir = join(workDir, basename(entry, ".zip"));
      await mkdir(extractDir, { recursive: true });
      await $`cd ${workDir} && unzip ${nestedZip} -d ${extractDir}`;
      await rm(nestedZip);
    }

    const compressed =
      await $`tar -c -C ${workDir} ${workDir} | zstd -15 --long -T0`.blob();
    await uploadToB2Bucket(fileName, compressed);
  } finally {
    await $`rm -rf ${workDir}`;
  }
}

// Fail the run when at least one download was rejected by `checkRes`.
if (errored) process.exit(1);
|
||||
|
|
|
@ -6,9 +6,11 @@
|
|||
"@types/bun": "latest"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.637.0",
|
||||
"@aws-sdk/lib-storage": "^3.637.0",
|
||||
"zod": "^3.23.8"
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue