WIP changes

This commit is contained in:
Nulo 2025-01-11 14:13:36 +00:00
parent 94c4b35740
commit 2f44b329e7
4 changed files with 149 additions and 61 deletions

View file

@@ -1,5 +1,6 @@
 {
   "spellright.language": ["es_AR"],
   "spellright.documentTypes": ["markdown", "latex", "plaintext"],
-  "editor.formatOnSave": true
+  "editor.formatOnSave": true,
+  "rust-analyzer.trace.server": "verbose"
 }

View file

@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "adler2"
@@ -525,6 +525,12 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
+[[package]]
+name = "fastrand"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4"
+
 [[package]]
 name = "filetime"
 version = "0.2.25"
@@ -654,6 +660,7 @@ dependencies = [
  "regex",
  "serde",
  "tar",
+ "tempfile",
  "zstd",
 ]
@@ -1279,6 +1286,19 @@ dependencies = [
  "xattr",
 ]
 
+[[package]]
+name = "tempfile"
+version = "3.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "tiny-keccak"
 version = "2.0.2"

View file

@@ -11,4 +11,5 @@ rayon = "1.10.0"
 regex = "1.11.1"
 serde = { version = "1.0.215", features = ["derive"] }
 tar = "0.4.43"
+tempfile = "3.14.0"
 zstd = "0.13.2"

View file

@@ -1,7 +1,9 @@
+use anyhow::anyhow;
 use rayon::prelude::*;
 use std::{
     env::args,
-    io::{self, BufRead},
+    fs,
+    io::{self, BufRead, Write},
     path::{Path, PathBuf},
 };
 use tar::Archive;
@@ -44,7 +46,10 @@ struct Bandera {
 fn read_csv_trimmed(path: PathBuf) -> anyhow::Result<String> {
     Ok(io::BufReader::new(std::fs::File::open(path)?)
         .lines()
-        .map(|line| line.map(|l| l.trim().to_string()))
+        .map(|line| {
+            line.map(|l| l.trim().to_string())
+                .map_err(anyhow::Error::from)
+        })
         .take_while(|line| {
             if let Ok(l) = line {
                 if l.starts_with('&') || l.starts_with(' ') || l.starts_with('\0') || l.is_empty() {
@@ -52,6 +57,9 @@ fn read_csv_trimmed(path: PathBuf) -> anyhow::Result<String> {
                         .replace('\0', "")
                         .replace(" ", "")
                         .is_empty()
+                } else if l.starts_with("Última") {
+                    // some datasets are erroneously missing the newline here
+                    false
                 } else {
                     true
                 }
@@ -59,7 +67,16 @@ fn read_csv_trimmed(path: PathBuf) -> anyhow::Result<String> {
                 true
             }
         })
-        .collect::<Result<Vec<_>, io::Error>>()?
+        .map(|line| {
+            line.and_then(|l| {
+                if l.starts_with("|") {
+                    Err(anyhow!("Alberdi S.A.: incorrect newlines"))
+                } else {
+                    Ok(l)
+                }
+            })
+        })
+        .collect::<Result<Vec<String>, _>>()?
         .join("\n"))
 }
@@ -72,8 +89,8 @@ struct Sucursal {
     sucursales_tipo: String,
     sucursales_calle: String,
     sucursales_numero: String,
-    sucursales_latitud: f64,
-    sucursales_longitud: f64,
+    sucursales_latitud: Option<f64>,
+    sucursales_longitud: Option<f64>,
     sucursales_observaciones: Option<String>,
     sucursales_barrio: Option<String>,
     sucursales_codigo_postal: String,
@@ -85,10 +102,12 @@ struct Sucursal {
     sucursales_jueves_horario_atencion: String,
     sucursales_viernes_horario_atencion: String,
     sucursales_sabado_horario_atencion: String,
+    /// bruh. sometimes they write it like this
+    #[serde(alias = "sucursales_domingohorario_atencion")]
     sucursales_domingo_horario_atencion: String,
 }
 
-fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> {
+fn import_dataset(conn: &Connection, dir_path: &Path) -> anyhow::Result<()> {
     conn.execute("BEGIN", duckdb::params![])?;
 
     let dataset_id = {
@@ -150,9 +169,17 @@ fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> {
     let mut file = csv::ReaderBuilder::new()
         .delimiter(b'|')
         .from_reader(csv.as_bytes());
-    let sucursales = file
-        .records()
-        .map(|r| r.unwrap().deserialize::<Sucursal>(None).unwrap());
+    let sucursales: Vec<Sucursal> = file
+        .deserialize()
+        .map(|result: csv::Result<Sucursal>| {
+            result.map_err(|e| {
+                println!("Error: {:?}", e);
+                println!("CSV content: {}", csv);
+                e
+            })
+        })
+        .collect::<Result<_, _>>()?;
+
     for sucursal in sucursales {
         app.append_row(duckdb::params![
             dataset_id,
@@ -183,49 +210,58 @@ fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> {
     {
         let file = read_csv_trimmed(dir_path.join("productos.csv"))?;
-        let mut app = conn.appender("precios")?;
+        let mut temp = tempfile::NamedTempFile::new()?;
+        // get path to file
+        temp.write_all(file.as_bytes())?;
+        // let mut app = conn.appender("precios")?;
         let start = std::time::Instant::now();
-        let mut rdr = csv::ReaderBuilder::new()
-            .delimiter(b'|')
-            .from_reader(file.as_bytes());
-        for result in rdr.records() {
-            match result {
-                Ok(record) => {
-                    // println!("{:?}", record);
-                    let producto: Producto = record.deserialize(None).unwrap();
-                    // println!("{:?}", producto);
-                    app.append_row(duckdb::params![
-                        dataset_id,
-                        producto.id_comercio,
-                        producto.id_bandera,
-                        producto.id_sucursal,
-                        producto.id_producto,
-                        producto.productos_ean,
-                        producto.productos_descripcion,
-                        producto.productos_cantidad_presentacion,
-                        producto.productos_unidad_medida_presentacion,
-                        producto.productos_marca,
-                        producto.productos_precio_lista,
-                        producto.productos_precio_referencia,
-                        producto.productos_cantidad_referencia,
-                        producto.productos_unidad_medida_referencia,
-                        producto.productos_precio_unitario_promo1,
-                        producto.productos_leyenda_promo1,
-                        producto.productos_precio_unitario_promo2,
-                        producto.productos_leyenda_promo2,
-                    ])?;
-                }
-                Err(e) => {
-                    println!("Error: {:?}", e);
-                    println!(
-                        "lines: {:?}",
-                        &file[e.position().unwrap().byte() as usize..]
-                    );
-                    panic!("Error parsing csv: {:#?}", e);
-                }
-            }
-        }
-        app.flush()?;
+        conn.execute(
+            "
+insert into precios select ? as id_dataset, * from read_csv(?, delim='|', header=true, nullstr='')",
+            duckdb::params![dataset_id, &temp.path().to_string_lossy()],
+        )?;
+        // let mut rdr = csv::ReaderBuilder::new()
+        //     .delimiter(b'|')
+        //     .from_reader(file.as_bytes());
+        // for result in rdr.records() {
+        //     match result {
+        //         Ok(record) => {
+        //             // println!("{:?}", record);
+        //             let producto: Producto = record.deserialize(None).unwrap();
+        //             // println!("{:?}", producto);
+        //             // app.append_row(duckdb::params![
+        //             //     dataset_id,
+        //             //     producto.id_comercio,
+        //             //     producto.id_bandera,
+        //             //     producto.id_sucursal,
+        //             //     producto.id_producto,
+        //             //     producto.productos_ean,
+        //             //     producto.productos_descripcion,
+        //             //     producto.productos_cantidad_presentacion,
+        //             //     producto.productos_unidad_medida_presentacion,
+        //             //     producto.productos_marca,
+        //             //     producto.productos_precio_lista,
+        //             //     producto.productos_precio_referencia,
+        //             //     producto.productos_cantidad_referencia,
+        //             //     producto.productos_unidad_medida_referencia,
+        //             //     producto.productos_precio_unitario_promo1,
+        //             //     producto.productos_leyenda_promo1,
+        //             //     producto.productos_precio_unitario_promo2,
+        //             //     producto.productos_leyenda_promo2,
+        //             // ])?;
+        //         }
+        //         Err(e) => {
+        //             println!("Error: {:?}", e);
+        //             println!(
+        //                 "lines: {:?}",
+        //                 &file[e.position().unwrap().byte() as usize..]
+        //             );
+        //             panic!("Error parsing csv: {:#?}", e);
+        //         }
+        //     }
+        // }
+        // app.flush()?;
         println!("Time taken flushed: {:?}", start.elapsed());
     }
@@ -233,6 +269,34 @@ fn import_dataset(conn: &Connection, dir_path: PathBuf) -> anyhow::Result<()> {
     Ok(())
 }
 
+/// a dataset dump is a dump of a single day with multiple datasets (one per comercio)
+fn import_dataset_dump(conn: &Connection, dir_path: &Path) -> anyhow::Result<()> {
+    for entry in fs::read_dir(dir_path)? {
+        let entry = entry?;
+        let path = entry.path();
+        if path.is_dir() {
+            let res = import_dataset(conn, &path);
+            match res {
+                Ok(_) => {}
+                Err(e) => {
+                    println!("Error importing dataset: {:?}", e);
+                    conn.execute("ROLLBACK", duckdb::params![])?;
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+fn is_dataset_dump(dir_path: &Path) -> anyhow::Result<bool> {
+    Ok(fs::read_dir(dir_path)?.any(|entry| {
+        entry
+            .as_ref()
+            .map(|e| e.file_name() == "dataset-info.json")
+            .unwrap_or(false)
+    }))
+}
+
 fn main() {
     let conn = Connection::open("importer-rs.db").unwrap();
@@ -254,13 +318,15 @@ fn main() {
     //     import_dataset(&conn.try_clone().unwrap(), parent.to_path_buf()).unwrap();
     // });
 
-    import_dataset(
-        &conn.try_clone().unwrap(),
-        args()
-            .nth(1)
-            .unwrap_or("/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned())
-            .into(),
-    )
-    .unwrap();
+    let path: PathBuf = args()
+        .nth(1)
+        .unwrap_or("/sepa_1_comercio-sepa-10_2024-11-23_09-05-11/".to_owned())
+        .into();
+
+    if is_dataset_dump(&path).unwrap() {
+        import_dataset_dump(&conn, &path).unwrap();
+    } else {
+        import_dataset(&conn.try_clone().unwrap(), &path).unwrap();
+    }
 
     println!("Hello, world!");
 }
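
For reference, here is a minimal, self-contained sketch of the ingestion pattern this commit switches to: instead of appending rows one by one through a duckdb Appender, the cleaned CSV is spilled to a NamedTempFile and DuckDB's read_csv() parses and bulk-inserts it in one statement. The three-column precios schema and the sample rows below are hypothetical stand-ins for the real table; the duckdb, tempfile, and anyhow APIs are the ones already used in the diff.

use std::io::Write;

use duckdb::Connection;

fn main() -> anyhow::Result<()> {
    let conn = Connection::open_in_memory()?;
    // hypothetical, much-reduced stand-in for the real `precios` schema
    conn.execute_batch(
        "create table precios (id_dataset bigint, id_producto bigint, productos_precio_lista double)",
    )?;

    // stand-in for the cleaned output of read_csv_trimmed(); note the empty
    // field, which nullstr='' turns into NULL
    let csv = "id_producto|productos_precio_lista\n779595|1250.5\n779596|\n";

    // same trick as the commit: write the cleaned CSV to a named temp file so
    // DuckDB can parse and insert it in a single statement
    let mut temp = tempfile::NamedTempFile::new()?;
    temp.write_all(csv.as_bytes())?;

    conn.execute(
        "insert into precios select ? as id_dataset, * from read_csv(?, delim='|', header=true, nullstr='')",
        duckdb::params![1, &temp.path().to_string_lossy()],
    )?;

    let count: i64 = conn.query_row("select count(*) from precios", [], |row| row.get(0))?;
    println!("inserted {count} rows");
    Ok(())
}

The temp file exists because read_csv() takes a path rather than an in-memory buffer, so the cleaned string has to hit disk before DuckDB can read it.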