diff --git a/sepa/importer-rs/Cargo.lock b/sepa/importer-rs/Cargo.lock index 3466a1a..1f3e8f6 100644 --- a/sepa/importer-rs/Cargo.lock +++ b/sepa/importer-rs/Cargo.lock @@ -420,6 +420,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crunchy" version = "0.2.2" @@ -466,6 +491,12 @@ dependencies = [ "strum 0.25.0", ] +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "equivalent" version = "1.0.1" @@ -619,8 +650,11 @@ dependencies = [ "anyhow", "csv", "duckdb", + "rayon", "regex", "serde", + "tar", + "zstd", ] [[package]] @@ -966,6 +1000,26 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.7" @@ -1497,3 +1551,31 @@ dependencies = [ "quote", "syn 2.0.90", ] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/sepa/importer-rs/Cargo.toml b/sepa/importer-rs/Cargo.toml index d7f5dd5..7c56b18 100644 --- a/sepa/importer-rs/Cargo.toml +++ b/sepa/importer-rs/Cargo.toml @@ -7,5 +7,8 @@ edition = "2021" anyhow = "1.0.93" csv = "1.3.1" duckdb = { version = "1.1.1", features = ["bundled"] } +rayon = "1.10.0" regex = "1.11.1" serde = { version = "1.0.215", features = ["derive"] } +tar = "0.4.43" +zstd = "0.13.2" diff --git a/sepa/importer-rs/src/main.rs b/sepa/importer-rs/src/main.rs index 8967588..2449f57 100644 --- a/sepa/importer-rs/src/main.rs +++ b/sepa/importer-rs/src/main.rs @@ -1,8 +1,11 @@ +use rayon::prelude::*; use std::{ env::args, io::{self, BufRead}, path::{Path, PathBuf}, }; +use tar::Archive; +use zstd::Decoder; use duckdb::Connection; @@ -232,13 +235,30 @@ fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> { fn main() { let conn = Connection::open("importer-rs.db").unwrap(); + + // let decoded = Decoder::new( + // std::fs::File::open("/d076720f-a7f0-4af8-b1d6-1b99d5a90c14-revID-a3de6c6a-8795-4348-a16d-bc626e9f1b2e-sepa_jueves.zip-repackaged.tar.zst").unwrap(), + // ) + // .unwrap(); + // let mut archive = Archive::new(decoded); + // archive + // .entries() + // .unwrap() + // .filter_map(|e| e.ok()) + // .filter(|e| e.path().unwrap().ends_with("comercio.csv")) + // .collect::>() + // .par_iter() + // .for_each(|entry| { + // let path = entry.path().unwrap(); + // let parent = path.parent().unwrap(); + // import_dataset(&conn.try_clone().unwrap(), parent.to_path_buf()).unwrap(); + // }); + import_dataset( &conn.try_clone().unwrap(), args() .nth(1) - .unwrap_or( - "/Users/diablo/Downloads/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned(), - ) + .unwrap_or("/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned()) .into(), ) .unwrap();