Compare commits

..

7 commits

Author SHA1 Message Date
1f60324e83 rust: no alocar por archivos que no se van a comprimir 2023-04-24 20:10:54 -03:00
bade1e4913 rust: mover código 2023-04-24 20:10:45 -03:00
3d3ac7c655 rust: cli flags 2023-04-23 22:00:21 -03:00
259f68d9c4 x 2023-04-23 21:46:21 -03:00
9f70f973ce rust: más parametros de optimización de sqlite 2023-04-23 21:40:18 -03:00
c4a1418cc2 rust: simplificar 2023-04-23 21:40:13 -03:00
92a9ab8340 rust: clean 2023-04-23 21:13:20 -03:00
3 changed files with 372 additions and 158 deletions

305
rust/Cargo.lock generated
View file

@ -28,6 +28,61 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "anstream"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is-terminal",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d"
[[package]]
name = "anstyle-parse"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd"
dependencies = [
"anstyle",
"windows-sys",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.1.0"
@ -67,6 +122,75 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "956ac1f6381d8d82ab4684768f89c0ea3afe66925ceadb4eeb3fc452ffc55d62"
dependencies = [
"clap_builder",
"clap_derive",
"once_cell",
]
[[package]]
name = "clap_builder"
version = "4.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84080e799e54cff944f4b4a4b0e71630b0e0443b25b985175c7dddc1a859b749"
dependencies = [
"anstream",
"anstyle",
"bitflags 1.3.2",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1"
[[package]]
name = "colorchoice"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
name = "errno"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "fallible-iterator"
version = "0.2.0"
@ -108,6 +232,41 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "io-lifetimes"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
dependencies = [
"hermit-abi",
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"io-lifetimes",
"rustix",
"windows-sys",
]
[[package]]
name = "libc"
version = "0.2.142"
@ -125,6 +284,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "linux-raw-sys"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36eb31c1778188ae1e64398743890d0877fef36d11521ac60406b42016e8c2cf"
[[package]]
name = "once_cell"
version = "1.17.1"
@ -137,13 +302,31 @@ version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
[[package]]
name = "proc-macro2"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rusqlite"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "549b9d036d571d42e6e85d1c1425e2ac83491075078ca9a15be021c56b1641f2"
dependencies = [
"bitflags",
"bitflags 2.1.0",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
@ -152,12 +335,17 @@ dependencies = [
]
[[package]]
name = "rust"
version = "0.1.0"
name = "rustix"
version = "0.37.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b864d3c18a5785a05953adeed93e2dca37ed30f18e69bba9f30079d51f363f"
dependencies = [
"brotli",
"rusqlite",
"walkdir",
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
@ -175,6 +363,45 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
[[package]]
name = "strsim"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "syn"
version = "2.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tofufirme"
version = "0.1.0"
dependencies = [
"brotli",
"clap",
"rusqlite",
"walkdir",
]
[[package]]
name = "unicode-ident"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
[[package]]
name = "utf8parse"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "vcpkg"
version = "0.2.15"
@ -233,3 +460,69 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"

View file

@ -1,5 +1,5 @@
[package]
name = "rust"
name = "tofufirme"
version = "0.1.0"
edition = "2021"
@ -7,6 +7,7 @@ edition = "2021"
[dependencies]
brotli = "3.3.4"
clap = { version = "4.2.4", features = ["derive"] }
# rayon = "1.7.0"
rusqlite = { version = "0.29.0", features = ["bundled", "unlock_notify"] }
# smol-potat = { version = "1.1.2", features = ["auto"] }

View file

@ -1,194 +1,114 @@
use std::{
error::Error,
fs::{read, remove_file, File},
fs::{read, File},
path::{Path, PathBuf},
sync::mpsc::{self, Receiver, Sender},
thread::{self, JoinHandle},
};
use brotli::enc::{backward_references::BrotliEncoderMode, BrotliEncoderParams};
use clap::Parser;
use rusqlite::Connection;
use walkdir::WalkDir;
fn db_path() -> &'static str {
"./archiveee.sqlite3"
// Command-line arguments for the archiver, parsed in `main` via `Args::parse()`.
// Both flags take a value and get a short and a long form (`#[arg(short, long)]`).
// NOTE(review): with clap's derive API the `///` lines on the fields are presumably
// used as the per-flag help text, i.e. they are user-facing strings rather than
// ordinary comments — confirm before rewording or translating them.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// A donde guardar el archivo
#[arg(short, long)]
// Path handed to `make_conn` / `collector` as the SQLite database to write.
output: String,
/// Carpeta a guardar
#[arg(short, long)]
// Directory that is walked for files to archive.
input: String,
}
fn main() -> Result<(), Box<dyn Error>> {
_ = remove_file(db_path());
let conn = make_conn()?;
make_db_schema(&conn)?;
let input = Path::new("./input");
let args = Args::parse();
{
let conn = make_conn(&args.output)?;
make_db_schema(&conn)?;
}
let input = Path::new(&args.input).to_owned();
let paths = iterator(&input).collect::<Vec<PathBuf>>();
// let (dict, dict_id) = make_dict(conn, input)?;
let n_threads = 4;
let chunks = paths.chunks(paths.len() / n_threads + paths.len() % n_threads);
println!("threads: {}", chunks.len());
// let mut enc = zstd::bulk::Compressor::new(5)?;
let paths = iterator(input).collect::<Vec<PathBuf>>();
let chunks = paths.chunks(paths.len() / 4);
println!("{}", chunks.len());
let (tx, rx): (Sender<Entry>, Receiver<Entry>) = mpsc::channel();
let threads = chunks
.map(|chunk| {
let chunkk = chunk.to_vec();
let ttx = tx.clone();
thread::spawn(move || {
let input = Path::new("./input");
let params = file_compress_params();
// let mut conn = make_conn().unwrap();
println!("thread");
// let trans = conn.transaction().unwrap();
// {
// let mut insert_stmt = trans
// .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
// .unwrap();
for path in chunkk {
compress_file(&path, input, &params, &ttx);
}
let (rx, threads) = {
let (tx, rx): (Sender<Entry>, Receiver<Entry>) = mpsc::channel();
let producers = chunks
.map(|chunk| {
let chunkk = chunk.to_vec();
let ttx = tx.clone();
let inputt = input.clone();
thread::spawn(move || producer(&inputt, chunkk, ttx))
})
})
.collect::<Vec<JoinHandle<()>>>();
let collector = thread::spawn(|| {
let mut conn = make_conn().unwrap();
println!("thread");
let trans = conn.transaction().unwrap();
{
let mut insert_stmt = trans
.prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
.unwrap();
for entry in rx {
insert_stmt.execute(entry).unwrap();
}
}
trans.commit().unwrap();
});
drop(tx);
.collect::<Vec<JoinHandle<()>>>();
collector.join().unwrap();
(rx, producers)
};
collector(&args.output, rx);
for thread in threads {
thread.join().unwrap();
}
// .map(|path| -> Option<()> {
// insert_file(path, input, params, insert_stmt);
// Some(())
// })
// .collect::<Vec<Option<()>>>();
// if iter.iter().any(|o| o.is_none()) {
// println!("Algo falló");
// }
// recurse(dict, conn)?;
println!("Hello, world!");
Ok(())
}
// fn insert_files(paths: &[PathBuf]) {
// let input = Path::new("./input");
// let params = file_compress_params();
// // let mut conn = make_conn().unwrap();
// println!("thread");
// // let trans = conn.transaction().unwrap();
// // {
// // let mut insert_stmt = trans
// // .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
// // .unwrap();
// for path in paths {
// compress_file(path, input, &params, tx);
// }
// // insert_stmt.execute((path_str, content, false)).unwrap();
// // }
// // trans.commit().unwrap();
// }
/// Worker-thread body: reads every file in `chunkk` and sends it to the collector.
///
/// For each path the entry sent over `ttx` is `(relative path, content, compressed)`:
/// the path is made relative to `input` with `strip_input` and converted lossily to
/// a `String`; the content is the Brotli-compressed file (flag `true`) when
/// `compressable(&path)` holds, or the raw file bytes (flag `false`) otherwise.
///
/// # Panics
///
/// Panics if a file cannot be opened, read or compressed, or if the receiving end
/// of the channel has been dropped. Every panic message names the offending path.
fn producer(input: &Path, chunkk: Vec<PathBuf>, ttx: Sender<(String, Vec<u8>, bool)>) {
    let params = file_compress_params();
    for path in chunkk {
        let key = strip_input(&path, input).to_string_lossy().into_owned();
        let (content, compressed) = if compressable(&path) {
            let mut file = File::open(&path)
                .unwrap_or_else(|e| panic!("cannot open {}: {}", path.display(), e));
            let mut encoded = Vec::new();
            brotli::BrotliCompress(&mut file, &mut encoded, &params)
                .unwrap_or_else(|e| panic!("cannot compress {}: {}", path.display(), e));
            (encoded, true)
        } else {
            // Borrow the path: cloning the `PathBuf` just to read the file is a
            // wasted allocation per uncompressed file.
            let raw = read(&path)
                .unwrap_or_else(|e| panic!("cannot read {}: {}", path.display(), e));
            (raw, false)
        };
        ttx.send((key, content, compressed)).unwrap_or_else(|_| {
            panic!("collector hung up before {} was sent", path.display())
        });
    }
}
// fn insert_file(
// path: &Path,
// input: &Path,
// params: &BrotliEncoderParams,
// insert_stmt: &mut rusqlite::CachedStatement,
// ) {
// // let mut path_comp = zstd::bulk::Compressor::with_dictionary(0, &dict).unwrap();
/// Drains `rx` into the `files` table of the database at `output`.
///
/// All rows are inserted through one cached statement inside a single
/// transaction, which is committed once the channel has been exhausted.
///
/// # Panics
///
/// Panics if the connection cannot be opened or if starting the transaction,
/// preparing the statement, an insert, or the commit fails.
fn collector(output: &str, rx: Receiver<(String, Vec<u8>, bool)>) {
    let mut db = make_conn(output).unwrap();
    let batch = db.transaction().unwrap();
    let mut insert = batch
        .prepare_cached("insert into files(path, content, compressed) values(?, ?, ?)")
        .unwrap();
    // `recv` fails only once every sender is gone and nothing is left buffered.
    while let Ok(row) = rx.recv() {
        insert.execute(row).unwrap();
    }
    // The statement borrows the transaction; release it before `commit` consumes it.
    drop(insert);
    batch.commit().unwrap();
}
// // println!("{:?}", path);
// // let path_compressed = path_comp
// // .compress(strip_input(&path, input).to_string_lossy().as_bytes())
// // .unwrap();
// let path_strip = strip_input(&path, input);
// let path_str = path_strip.to_string_lossy();
// if compressable(path) {
// let mut content_compressed = Vec::new();
// let mut file = File::open(path).unwrap();
// brotli::BrotliCompress(&mut file, &mut content_compressed, &params).unwrap();
// // let content_compressed = enc.compress(&content)?;
// insert_stmt
// .execute((path_str, content_compressed, true))
// .unwrap();
// } else {
// let content = read(path.clone()).unwrap();
// insert_stmt.execute((path_str, content, false)).unwrap();
// }
// }
fn make_conn() -> Result<Connection, Box<dyn Error>> {
let conn = Connection::open(db_path())?;
/// Opens the SQLite database at `output` and applies the archiver's pragmas.
///
/// The pragmas are set in this order: `journal_mode = OFF`, `synchronous = 0`,
/// `cache_size = 1000000`, `locking_mode = EXCLUSIVE`.
/// NOTE(review): these look like bulk-load settings that trade crash safety for
/// write speed — confirm against the SQLite PRAGMA documentation before reusing
/// this connection setup for data that must survive an interrupted run.
///
/// # Errors
///
/// Returns the error from opening the database or from the first pragma that fails.
fn make_conn(output: &str) -> Result<Connection, Box<dyn Error>> {
    let db = Connection::open(output)?;
    db.pragma_update(None, "journal_mode", "OFF")?;
    db.pragma_update(None, "synchronous", 0)?;
    db.pragma_update(None, "cache_size", 1000000)?;
    db.pragma_update(None, "locking_mode", "EXCLUSIVE")?;
    Ok(db)
}
type Entry = (String, Vec<u8>, bool);
/// Reads one file and sends it over `tx` as `(relative path, content, compressed)`.
///
/// The path is made relative to `input` with `strip_input`. When
/// `compressable(path)` holds, the file is streamed through Brotli with `params`
/// and sent with the flag `true`; otherwise its raw bytes are sent with `false`.
///
/// # Panics
///
/// Panics if the file cannot be opened, read or compressed, or if the receiver
/// has been dropped.
fn compress_file(path: &Path, input: &Path, params: &BrotliEncoderParams, tx: &Sender<Entry>) {
    let relative = strip_input(path, input);
    let key = relative.to_string_lossy().into_owned();
    let (content, compressed) = if compressable(path) {
        let mut encoded = Vec::new();
        let mut source = File::open(path).unwrap();
        brotli::BrotliCompress(&mut source, &mut encoded, params).unwrap();
        (encoded, true)
    } else {
        (read(path).unwrap(), false)
    };
    tx.send((key, content, compressed)).unwrap();
}
/// Creates the archive schema on `conn`.
///
/// Runs two statements in order: the `files(path, content, compressed)` table,
/// then a unique index named `path` on `files(path)`.
///
/// # Errors
///
/// Returns the error from the first statement that fails; later statements are
/// not attempted.
fn make_db_schema(conn: &Connection) -> Result<(), Box<dyn Error>> {
    const STATEMENTS: [&str; 2] = [
        "create table files(path text, content blob, compressed bool)",
        "create unique index path on files(path)",
    ];
    for sql in STATEMENTS {
        conn.execute(sql, [])?;
    }
    Ok(())
}
// fn make_dict(conn: Connection, input: &Path) -> Result<(Vec<u8>, i64), Box<dyn Error>> {
// let mut insert_dict_stmt =
// conn.prepare_cached("insert into path_dictionaries(dictionary) values(?1)")?;
// let mut all_paths = Vec::new();
// let iter = iterator(input);
// for entry in iter {
// all_paths.push(entry.to_string_lossy().as_bytes().to_vec());
// }
// println!("path n {}", all_paths.len());
// let dict = zstd::dict::from_samples(&all_paths, 999999)?;
// println!("dict");
// let dict_id = insert_dict_stmt.insert(params![&dict])?;
// Ok((dict, dict_id))
// }
fn file_compress_params() -> BrotliEncoderParams {
let mut params = BrotliEncoderParams::default();
params.quality = 0;
@ -201,15 +121,15 @@ fn iterator(input: &Path) -> impl Iterator<Item = PathBuf> {
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| e.file_type().is_file())
.filter(|e| !already_compressed(e.path()))
.map(|e| e.path().to_owned())
.filter(|e| !already_compressed(e))
}
/// Returns `path` relative to `input` as an owned `PathBuf`.
///
/// # Panics
///
/// Panics if `input` is not a prefix of `path`.
fn strip_input(path: &Path, input: &Path) -> PathBuf {
    let relative = path.strip_prefix(input).unwrap();
    relative.to_path_buf()
}
fn already_compressed(path: &PathBuf) -> bool {
/// Reports whether the lossy string form of `path` ends in `.br` or `.gz`.
///
/// The check is a plain, case-sensitive suffix match on the whole path text, so
/// a file named exactly `.br` also counts.
fn already_compressed(path: &Path) -> bool {
    let text = path.to_string_lossy();
    [".br", ".gz"].iter().any(|&suffix| text.ends_with(suffix))
}