Compare commits

...

2 commits

Author SHA1 Message Date
348d054b7b renombrar warcificator -> scraper-rs 2024-01-10 21:46:10 -03:00
613efc3111 warcificator: restructuracion masiva 2024-01-10 21:44:35 -03:00
5 changed files with 372 additions and 141 deletions

3
.gitignore vendored
View file

@ -15,4 +15,5 @@ target/
.env.*
*/flamegraph.svg
*/perf.data*
*/perf.data*
scraper-rs/debug/

8
.vscode/launch.json vendored
View file

@ -7,13 +7,13 @@
{
"type": "lldb",
"request": "launch",
"name": "warcificator",
"cwd": "warcificator/",
"name": "scraper-rs",
"cwd": "scraper-rs/",
"cargo": {
// https://github.com/vadimcn/codelldb/issues/884
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
"args": ["build", "--manifest-path=scraper-rs/Cargo.toml"]
},
"args": ["../data/carrefour"],
"args": ["../data/Carrefour.txt"],
"env": {}
},
{

View file

@ -17,6 +17,17 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "again"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05802a5ad4d172eaf796f7047b42d0af9db513585d16d4169660a21613d34b93"
dependencies = [
"log",
"rand 0.7.3",
"wasm-timer",
]
[[package]]
name = "ahash"
version = "0.8.7"
@ -198,16 +209,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.18"
@ -218,13 +219,10 @@ dependencies = [
]
[[package]]
name = "deranged"
version = "0.3.11"
name = "either"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
]
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "encoding_rs"
@ -299,6 +297,21 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futures"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.30"
@ -306,6 +319,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@ -314,6 +328,34 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
[[package]]
name = "futures-executor"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
[[package]]
name = "futures-macro"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.30"
@ -332,10 +374,27 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]]
@ -346,7 +405,7 @@ checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f"
dependencies = [
"cfg-if",
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
@ -491,6 +550,15 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "instant"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]]
name = "ipnet"
version = "2.9.0"
@ -578,10 +646,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
dependencies = [
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys",
]
[[package]]
name = "nanoid"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ffa00dec017b5b1a8b7cf5e2c008bfda1aa7e0697ac1508b491fdf2622fb4d8"
dependencies = [
"rand 0.8.5",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
@ -629,6 +706,17 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
[[package]]
name = "parking_lot"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
dependencies = [
"instant",
"lock_api",
"parking_lot_core 0.8.6",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -636,7 +724,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core",
"parking_lot_core 0.9.9",
]
[[package]]
name = "parking_lot_core"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
dependencies = [
"cfg-if",
"instant",
"libc",
"redox_syscall 0.2.16",
"smallvec",
"winapi",
]
[[package]]
@ -647,7 +749,7 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"redox_syscall 0.4.1",
"smallvec",
"windows-targets",
]
@ -677,29 +779,109 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
[[package]]
name = "powerfmt"
version = "0.2.0"
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "proc-macro2"
version = "1.0.71"
version = "1.0.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8"
checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.33"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core 0.5.1",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.4",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
@ -741,6 +923,7 @@ dependencies = [
"system-configuration",
"tokio",
"tokio-rustls",
"tokio-socks",
"tokio-util",
"tower-service",
"url",
@ -758,7 +941,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74"
dependencies = [
"cc",
"getrandom",
"getrandom 0.2.11",
"libc",
"spin",
"untrusted",
@ -828,6 +1011,26 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper-rs"
version = "0.1.0"
dependencies = [
"again",
"async-channel",
"nanoid",
"rand 0.8.5",
"reqwest",
"rusqlite",
"serde",
"serde_json",
"simple-error",
"thiserror",
"tl",
"tokio",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "sct"
version = "0.7.1"
@ -899,6 +1102,12 @@ dependencies = [
"libc",
]
[[package]]
name = "simple-error"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175"
[[package]]
name = "slab"
version = "0.4.9"
@ -932,9 +1141,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "syn"
version = "2.0.43"
version = "2.0.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53"
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
dependencies = [
"proc-macro2",
"quote",
@ -964,18 +1173,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.55"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2"
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.55"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581"
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
dependencies = [
"proc-macro2",
"quote",
@ -992,35 +1201,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
dependencies = [
"deranged",
"itoa",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f"
dependencies = [
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@ -1052,7 +1232,7 @@ dependencies = [
"libc",
"mio",
"num_cpus",
"parking_lot",
"parking_lot 0.12.1",
"pin-project-lite",
"signal-hook-registry",
"socket2",
@ -1081,6 +1261,18 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-socks"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51165dfa029d2a65969413a6cc96f354b86b464498702f174a4efa13608fd8c0"
dependencies = [
"either",
"futures-util",
"thiserror",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.10"
@ -1107,24 +1299,11 @@ version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
"log",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-appender"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
dependencies = [
"crossbeam-channel",
"thiserror",
"time",
"tracing-subscriber",
]
[[package]]
name = "tracing-attributes"
version = "0.1.27"
@ -1243,20 +1422,10 @@ dependencies = [
]
[[package]]
name = "warcificator"
version = "0.1.0"
dependencies = [
"async-channel",
"reqwest",
"rusqlite",
"serde",
"serde_json",
"tl",
"tokio",
"tracing",
"tracing-appender",
"tracing-subscriber",
]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "wasi"
@ -1330,6 +1499,21 @@ version = "0.2.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f"
[[package]]
name = "wasm-timer"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
dependencies = [
"futures",
"js-sys",
"parking_lot 0.11.2",
"pin-utils",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "web-sys"
version = "0.3.66"

View file

@ -1,24 +1,29 @@
[package]
name = "warcificator"
name = "scraper-rs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
again = "0.1.2"
async-channel = "2.1.1"
nanoid = "0.4.0"
rand = "0.8.5"
# lol_html = "1.2.0"
reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
"gzip",
"brotli",
"socks",
] }
rusqlite = "0.30.0"
# scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
simple-error = "0.3.0"
thiserror = "1.0.56"
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] }
tokio = { version = "1.35.1", features = ["full"] }
tracing = { version = "0.1", features = ["log"] }
tracing-appender = "0.2.3"
tracing-subscriber = "0.3.18"
tracing = "0.1"
tracing-subscriber = "0.3"

View file

@ -1,11 +1,18 @@
use again::RetryPolicy;
use async_channel::{Receiver, Sender};
use nanoid::nanoid;
use rand::seq::SliceRandom;
use reqwest::Url;
use rusqlite::Connection;
use simple_error::{bail, SimpleError};
use std::{
borrow::Cow,
env::{self, args},
fs,
time::{SystemTime, UNIX_EPOCH},
path::PathBuf,
time::{Duration, SystemTime, UNIX_EPOCH},
};
use thiserror::Error;
use tl::VDom;
use tokio::io::{stderr, AsyncWriteExt};
@ -95,14 +102,16 @@ struct PrecioPoint {
// }
#[tokio::main]
async fn main() {
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
let mut args = args().skip(1);
let links_list_path = args.next().unwrap();
let links_list_path = args.next().expect("Falta arg para path de lista de urls");
let links_str = fs::read_to_string(links_list_path).unwrap();
let links = links_str
.split("\n")
.split('\n')
.map(|s| s.trim())
.filter(|s| s.len() > 0)
.filter(|s| !s.is_empty())
.map(|s| s.to_owned())
.collect::<Vec<_>>();
@ -112,8 +121,8 @@ async fn main() {
let mut handles = Vec::new();
for _ in 1..env::var("N_COROUTINES")
.map_or(Ok(32), |s| s.parse::<usize>())
.unwrap()
.map_or(Ok(128), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número")
{
let rx = receiver.clone();
let tx = res_sender.clone();
@ -134,6 +143,7 @@ async fn main() {
db_writer_handle
};
handle.await.unwrap();
Ok(())
}
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
@ -145,46 +155,68 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
tx.send(ex).await.unwrap();
}
Err(err) => {
stderr()
.write_all(format!("Failed to fetch {}: {:?}\n", url.as_str(), err).as_bytes())
.await
.unwrap();
tracing::error!(error=%err, url=url);
}
}
}
}
#[derive(Debug)]
#[derive(Debug, Error)]
enum FetchError {
HttpError(reqwest::Error),
ParseError(&'static str),
#[error("reqwest error")]
Http(#[from] reqwest::Error),
#[error("http status: {0}")]
HttpStatus(reqwest::StatusCode),
#[error("parse error")]
Parse(#[from] SimpleError),
#[error("tl error")]
Tl(#[from] tl::ParseError),
}
#[tracing::instrument(skip(client))]
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
let request = client.get(url.as_str()).build().unwrap();
let response = client
.execute(request)
.await
.map_err(|e| FetchError::HttpError(e))?;
let body = response
.text()
.await
.map_err(|e| FetchError::HttpError(e))?;
let policy = RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10)
.with_jitter(true);
let dom = tl::parse(&body, tl::ParserOptions::default()).unwrap();
// let parser = dom.parser();
let response = policy
.retry(|| {
let request = client.get(url.as_str()).build().unwrap();
client.execute(request)
})
.await
.map_err(FetchError::Http)?;
if !response.status().is_success() {
return Err(FetchError::HttpStatus(response.status()));
}
let body = response.text().await.map_err(FetchError::Http)?;
let point = parse_carrefour(url, &dom)?;
let maybe_point = {
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
parse_carrefour(url, &dom)
};
let point = match maybe_point {
Ok(p) => Ok(p),
Err(err) => {
let debug_path = PathBuf::from("debug/");
tokio::fs::create_dir_all(&debug_path).await.unwrap();
let file_path = debug_path.join(format!("{}.html", nanoid!()));
tokio::fs::write(&file_path, &body).await.unwrap();
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
Err(err)
}
}?;
Ok(point)
}
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchError> {
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, SimpleError> {
let precio_centavos = {
get_meta_content(dom, "product:price:amount")?
.map(|s| {
s.parse::<f64>()
.map_err(|_| FetchError::ParseError("Failed to parse number"))
.map_err(|_| SimpleError::new("Failed to parse number"))
})
.transpose()
.map(|f| f.map(|f| (f * 100.0) as u64))
@ -195,7 +227,7 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
Some(s) => match s.as_ref() {
"oos" => Some(false),
"instock" => Some(true),
_ => return Err(FetchError::ParseError("Not a valid product:availability")),
_ => return Err(SimpleError::new("Not a valid product:availability")),
},
None => None,
};
@ -204,7 +236,10 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
let json = &parse_script_json(dom, "__STATE__")?;
let state = json
.as_object()
.ok_or(FetchError::ParseError("Seed state not an object"))?;
.ok_or(SimpleError::new("Seed state not an object"))?;
if state.is_empty() {
bail!("Seed state is an empty object")
}
let (_, product_json) = state
.into_iter()
.find(|(key, val)| {
@ -214,11 +249,11 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
.and_then(|val| val.get("__typename"))
.map_or(false, |typename| typename == "Product")
})
.ok_or(FetchError::ParseError("No product in seed state"))?;
.ok_or(SimpleError::new("No product in seed state"))?;
let cache_id = product_json
.get("cacheId")
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No cacheId in seed state"))?;
.ok_or(SimpleError::new("No cacheId in seed state"))?;
let (_, product_sku_json) = state
.iter()
.find(|(key, val)| {
@ -228,11 +263,11 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
.map_or(false, |typename| typename == "SKU")
})
})
.ok_or(FetchError::ParseError("No Product:cacheId* found"))?;
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
product_sku_json
.get("ean")
.and_then(|v| v.as_str())
.ok_or(FetchError::ParseError("No product SKU in seed state"))?
.ok_or(SimpleError::new("No product SKU in seed state"))?
.to_string()
};
@ -248,7 +283,10 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
})
}
fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a, str>>, FetchError> {
fn get_meta_content<'a>(
dom: &'a VDom<'a>,
prop: &str,
) -> Result<Option<Cow<'a, str>>, SimpleError> {
let tag = &dom
.query_selector(&format!("meta[property=\"{}\"]", prop))
.and_then(|mut iter| iter.next())
@ -259,14 +297,14 @@ fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a,
tag.attributes()
.get("content")
.flatten()
.ok_or(FetchError::ParseError("Failed to get content attr"))?
.ok_or(SimpleError::new("Failed to get content attr"))?
.as_utf8_str(),
)),
None => Ok(None),
}
}
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, FetchError> {
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, SimpleError> {
let parser = dom.parser();
let inner_html = &dom
.query_selector(&format!(
@ -282,11 +320,11 @@ fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, Fet
.iter()
.find(|n| n.as_tag().is_some())
})
.ok_or(FetchError::ParseError("Failed to get script tag"))?
.ok_or(SimpleError::new("Failed to get script tag"))?
.inner_html(parser);
Ok(inner_html
inner_html
.parse()
.map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))?)
.map_err(|_| SimpleError::new("Couldn't parse JSON in script"))
}
fn now_sec() -> u64 {
@ -300,7 +338,10 @@ fn now_sec() -> u64 {
async fn db_writer(rx: Receiver<PrecioPoint>) {
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
let mut n = 0;
while let Ok(res) = rx.recv().await {
println!("{:?}", res)
n += 1;
println!("{}", n);
// println!("{:?}", res)
}
}