importer-rs

This commit is contained in:
Cat /dev/Nulo 2024-12-01 22:54:03 -03:00
parent 307e7616bf
commit eb61721b5e
3 changed files with 108 additions and 3 deletions

View file

@@ -420,6 +420,31 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "crunchy"
version = "0.2.2"
@@ -466,6 +491,12 @@ dependencies = [
"strum 0.25.0",
]
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "equivalent"
version = "1.0.1"
@@ -619,8 +650,11 @@ dependencies = [
"anyhow",
"csv",
"duckdb",
"rayon",
"regex",
"serde",
"tar",
"zstd",
]
[[package]]
@@ -966,6 +1000,26 @@ dependencies = [
"getrandom",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "redox_syscall"
version = "0.5.7"
@@ -1497,3 +1551,31 @@ dependencies = [
"quote",
"syn 2.0.90",
]
[[package]]
name = "zstd"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.13+zstd.1.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa"
dependencies = [
"cc",
"pkg-config",
]

View file

@@ -7,5 +7,8 @@ edition = "2021"
anyhow = "1.0.93"
csv = "1.3.1"
duckdb = { version = "1.1.1", features = ["bundled"] }
rayon = "1.10.0"
regex = "1.11.1"
serde = { version = "1.0.215", features = ["derive"] }
tar = "0.4.43"
zstd = "0.13.2"

View file

@@ -1,8 +1,11 @@
use rayon::prelude::*;
use std::{
env::args,
io::{self, BufRead},
path::{Path, PathBuf},
};
use tar::Archive;
use zstd::Decoder;
use duckdb::Connection;
@@ -232,13 +235,30 @@ fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> {
fn main() {
let conn = Connection::open("importer-rs.db").unwrap();
// let decoded = Decoder::new(
// std::fs::File::open("/d076720f-a7f0-4af8-b1d6-1b99d5a90c14-revID-a3de6c6a-8795-4348-a16d-bc626e9f1b2e-sepa_jueves.zip-repackaged.tar.zst").unwrap(),
// )
// .unwrap();
// let mut archive = Archive::new(decoded);
// archive
// .entries()
// .unwrap()
// .filter_map(|e| e.ok())
// .filter(|e| e.path().unwrap().ends_with("comercio.csv"))
// .collect::<Vec<_>>()
// .par_iter()
// .for_each(|entry| {
// let path = entry.path().unwrap();
// let parent = path.parent().unwrap();
// import_dataset(&conn.try_clone().unwrap(), parent.to_path_buf()).unwrap();
// });
import_dataset(
&conn.try_clone().unwrap(),
args()
.nth(1)
.unwrap_or(
"/Users/diablo/Downloads/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned(),
)
.unwrap_or("/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned())
.into(),
)
.unwrap();