From 63ebf48da69bddf5290e0378d3a9a00faed21acf Mon Sep 17 00:00:00 2001
From: Nulo
Date: Sun, 23 Apr 2023 21:12:29 -0300
Subject: [PATCH] =?UTF-8?q?RUST=20=F0=9F=9A=80=F0=9F=9A=80=F0=9F=9A=80?=
 =?UTF-8?q?=F0=9F=9A=80=F0=9F=9A=80=F0=9F=9A=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore       |   3 +
 rust/.gitignore  |   3 +
 rust/Cargo.lock  | 235 +++++++++++++++++++++++++++++++++++++++++++++
 rust/Cargo.toml  |  15 +++
 rust/src/main.rs | 241 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 497 insertions(+)
 create mode 100644 rust/.gitignore
 create mode 100644 rust/Cargo.lock
 create mode 100644 rust/Cargo.toml
 create mode 100644 rust/src/main.rs

diff --git a/.gitignore b/.gitignore
index b27af60..7836c55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@ keywords-no-compressed.zip
 *.sqlite3
 *.sqlite3-shm
 *.sqlite3-wal
+rust/*.sqlite3
+rust/*.sqlite3-shm
+rust/*.sqlite3-wal
diff --git a/rust/.gitignore b/rust/.gitignore
new file mode 100644
index 0000000..564ec86
--- /dev/null
+++ b/rust/.gitignore
@@ -0,0 +1,3 @@
+target/
+perf.data*
+flamegraph.svg
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
new file mode 100644
index 0000000..cacd379
--- /dev/null
+++ b/rust/Cargo.lock
@@ -0,0 +1,235 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "ahash"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
+dependencies = [
+ "getrandom",
+ "once_cell",
+ "version_check",
+]
+
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
+[[package]]
+name = "bitflags"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c70beb79cbb5ce9c4f8e20849978f34225931f665bb49efa6982875a4d5facb3"
+
+[[package]]
+name = "brotli"
+version = "3.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "2.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "fallible-iterator"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
+
+[[package]]
+name = "fallible-streaming-iterator"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
+
+[[package]]
+name = "getrandom"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "hashlink"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa"
+dependencies = [
+ "hashbrown",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.142"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317"
+
+[[package]]
+name = "libsqlite3-sys"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "afc22eff61b133b115c6e8c74e818c628d6d5e7a502afea6f64dee076dd94326"
+dependencies = [
+ "cc",
+ "pkg-config",
+ "vcpkg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+
+[[package]]
+name = "rusqlite"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "549b9d036d571d42e6e85d1c1425e2ac83491075078ca9a15be021c56b1641f2"
+dependencies = [
+ "bitflags",
+ "fallible-iterator",
+ "fallible-streaming-iterator",
+ "hashlink",
+ "libsqlite3-sys",
+ "smallvec",
+]
+
+[[package]]
+name = "rust"
+version = "0.1.0"
+dependencies = [
+ "brotli",
+ "rusqlite",
+ "walkdir",
+]
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
+
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "walkdir"
+version = "2.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
new file mode 100644
index 0000000..3ee2882
--- /dev/null
+++ b/rust/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "rust"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+brotli = "3.3.4"
+# rayon = "1.7.0"
+rusqlite = { version = "0.29.0", features = ["bundled", "unlock_notify"] }
+# smol-potat = { version = "1.1.2", features = ["auto"] }
+# sqlite-zstd = "0.3.2"
+walkdir = "2.3.3"
+# zstd = "0.12.3"
diff --git a/rust/src/main.rs b/rust/src/main.rs
new file mode 100644
index 0000000..11ee5ef
--- /dev/null
+++ b/rust/src/main.rs
@@ -0,0 +1,241 @@
+use std::{
+    error::Error,
+    fs::{read, remove_file, File},
+    path::{Path, PathBuf},
+    sync::mpsc::{self, Receiver, Sender},
+    thread::{self, JoinHandle},
+};
+
+use brotli::enc::{backward_references::BrotliEncoderMode, BrotliEncoderParams};
+use rusqlite::Connection;
+use walkdir::WalkDir;
+
+/// Path of the SQLite archive; `main` deletes and recreates it on every run.
+fn db_path() -> &'static str {
+    "./archiveee.sqlite3"
+}
+
+/// Packs the files under `./input` into a fresh SQLite archive: worker threads
+/// read/compress files while one collector thread performs all the inserts.
+fn main() -> Result<(), Box<dyn Error>> {
+    _ = remove_file(db_path());
+    let conn = make_conn()?;
+    make_db_schema(&conn)?;
+    let input = Path::new("./input");
+
+    // let (dict, dict_id) = make_dict(conn, input)?;
+
+    // let mut enc = zstd::bulk::Compressor::new(5)?;
+    let paths = iterator(input).collect::<Vec<_>>();
+
+    // At most four chunks: the size is rounded up and clamped to 1, because
+    // `chunks(0)` panics (the old `paths.len() / 4` was 0 for fewer than four
+    // files and could also yield more than four chunks).
+    let chunks = paths.chunks(((paths.len() + 3) / 4).max(1));
+    println!("{}", chunks.len());
+
+    let (tx, rx): (Sender<Entry>, Receiver<Entry>) = mpsc::channel();
+
+    let threads = chunks
+        .map(|chunk| {
+            let chunkk = chunk.to_vec();
+            let ttx = tx.clone();
+            thread::spawn(move || {
+                let input = Path::new("./input");
+                let params = file_compress_params();
+                // let mut conn = make_conn().unwrap();
+                println!("thread");
+                // let trans = conn.transaction().unwrap();
+                // {
+                // let mut insert_stmt = trans
+                //     .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
+                //     .unwrap();
+                for path in chunkk {
+                    compress_file(&path, input, &params, &ttx);
+                }
+            })
+        })
+        .collect::<Vec<JoinHandle<()>>>();
+    let collector = thread::spawn(|| {
+        let mut conn = make_conn().unwrap();
+        println!("thread");
+        let trans = conn.transaction().unwrap();
+        {
+            let mut insert_stmt = trans
+                .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
+                .unwrap();
+            for entry in rx {
+                insert_stmt.execute(entry).unwrap();
+            }
+        }
+        trans.commit().unwrap();
+    });
+    drop(tx);
+
+    collector.join().unwrap();
+    for thread in threads {
+        thread.join().unwrap();
+    }
+    // .map(|path| -> Option<()> {
+    //     insert_file(path, input, params, insert_stmt);
+    //     Some(())
+    // })
+    // .collect::<Vec<Option<()>>>();
+    // if iter.iter().any(|o| o.is_none()) {
+    //     println!("Algo falló");
+    // }
+
+    // recurse(dict, conn)?;
+
+    println!("Hello, world!");
+    Ok(())
+}
+
+// fn insert_files(paths: &[PathBuf]) {
+//     let input = Path::new("./input");
+//     let params = file_compress_params();
+//     // let mut conn = make_conn().unwrap();
+//     println!("thread");
+//     // let trans = conn.transaction().unwrap();
+//     // {
+//     // let mut insert_stmt = trans
+//     //     .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
+//     //     .unwrap();
+//     for path in paths {
+//         compress_file(path, input, &params, tx);
+//     }
+//     // insert_stmt.execute((path_str, content, false)).unwrap();
+//     // }
+//     // trans.commit().unwrap();
+// }
+
+// fn insert_file(
+//     path: &Path,
+//     input: &Path,
+//     params: &BrotliEncoderParams,
+//     insert_stmt: &mut rusqlite::CachedStatement,
+// ) {
+//     // let mut path_comp = zstd::bulk::Compressor::with_dictionary(0, &dict).unwrap();
+
+//     // println!("{:?}", path);
+//     // let path_compressed = path_comp
+//     //     .compress(strip_input(&path, input).to_string_lossy().as_bytes())
+//     //     .unwrap();
+//     let path_strip = strip_input(&path, input);
+//     let path_str = path_strip.to_string_lossy();
+//     if compressable(path) {
+//         let mut content_compressed = Vec::new();
+//         let mut file = File::open(path).unwrap();
+//         brotli::BrotliCompress(&mut file, &mut content_compressed, &params).unwrap();
+//         // let content_compressed = enc.compress(&content)?;
+//         insert_stmt
+//             .execute((path_str, content_compressed, true))
+//             .unwrap();
+//     } else {
+//         let content = read(path.clone()).unwrap();
+//         insert_stmt.execute((path_str, content, false)).unwrap();
+//     }
+// }
+
+/// Opens the archive with `journal_mode = OFF` and `synchronous = 0`.
+fn make_conn() -> Result<Connection, Box<dyn Error>> {
+    let conn = Connection::open(db_path())?;
+    conn.pragma_update(None, "journal_mode", "OFF")?;
+    conn.pragma_update(None, "synchronous", 0)?;
+    Ok(conn)
+}
+
+/// Row sent from a worker to the collector: (relative path, content, is compressed).
+type Entry = (String, Vec<u8>, bool);
+
+/// Reads `path` (Brotli-compressing it when `compressable`) and sends the row,
+/// keyed by the path relative to `input`, over `tx`; panics on any failure.
+fn compress_file(path: &Path, input: &Path, params: &BrotliEncoderParams, tx: &Sender<Entry>) {
+    let path_strip = strip_input(path, input);
+    let path_str = path_strip.to_string_lossy();
+    if compressable(path) {
+        let mut content_compressed = Vec::new();
+        let mut file = File::open(path).unwrap();
+        brotli::BrotliCompress(&mut file, &mut content_compressed, params).unwrap();
+        // let content_compressed = enc.compress(&content)?;
+        // insert_stmt
+        //     .execute((path_str, content_compressed, true))
+        //     .unwrap();
+        tx.send((path_str.to_string(), content_compressed, true))
+            .unwrap();
+    } else {
+        let content = read(path).unwrap();
+        // insert_stmt.execute((path_str, content, false)).unwrap();
+        tx.send((path_str.to_string(), content, false)).unwrap();
+    }
+}
+
+/// Creates the `files` table and a unique index on its `path` column.
+fn make_db_schema(conn: &Connection) -> Result<(), Box<dyn Error>> {
+    conn.execute(
+        "create table files(path text, content blob, compressed bool)",
+        [],
+    )?;
+    // conn.execute(
+    //     "create table files(path blob, path_dictionary_id integer, content blob, compressed bool)",
+    //     [],
+    // )?;
+    // conn.execute("create table path_dictionaries(dictionary blob)", [])?;
+    conn.execute("create unique index path on files(path)", [])?;
+    Ok(())
+}
+
+// fn make_dict(conn: Connection, input: &Path) -> Result<(Vec<u8>, i64), Box<dyn Error>> {
+//     let mut insert_dict_stmt =
+//         conn.prepare_cached("insert into path_dictionaries(dictionary) values(?1)")?;
+//     let mut all_paths = Vec::new();
+//     let iter = iterator(input);
+//     for entry in iter {
+//         all_paths.push(entry.to_string_lossy().as_bytes().to_vec());
+//     }
+//     println!("path n {}", all_paths.len());
+//     let dict = zstd::dict::from_samples(&all_paths, 999999)?;
+//     println!("dict");
+//     let dict_id = insert_dict_stmt.insert(params![&dict])?;
+//     Ok((dict, dict_id))
+// }
+
+/// Brotli settings used for file contents: quality 0, text mode.
+fn file_compress_params() -> BrotliEncoderParams {
+    let mut params = BrotliEncoderParams::default();
+    params.quality = 0;
+    params.mode = BrotliEncoderMode::BROTLI_MODE_TEXT;
+    params
+}
+
+/// Walks `input` and yields the paths of regular files, dropping entries that
+/// could not be read and names that `already_compressed` rejects.
+fn iterator(input: &Path) -> impl Iterator<Item = PathBuf> {
+    WalkDir::new(input)
+        .into_iter()
+        .filter_map(|e| e.ok())
+        .filter(|e| e.file_type().is_file())
+        .map(|e| e.path().to_owned())
+        .filter(|e| !already_compressed(e))
+}
+
+/// Returns `path` relative to `input`; panics if `path` is not under `input`.
+fn strip_input(path: &Path, input: &Path) -> PathBuf {
+    path.strip_prefix(input).unwrap().to_owned()
+}
+
+/// True when the path ends in `.br` or `.gz`; such files are left out of the archive.
+fn already_compressed(path: &Path) -> bool {
+    let p = path.to_string_lossy();
+    p.ends_with(".br") || p.ends_with(".gz")
+}
+
+/// True for the text formats that are Brotli-compressed before being stored.
+fn compressable(path: &Path) -> bool {
+    let p = path.to_string_lossy();
+    p.ends_with(".html")
+        || p.ends_with(".css")
+        || p.ends_with(".js")
+        || p.ends_with(".json")
+        || p.ends_with(".svg")
+}