mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-25 19:16:19 +00:00
Compare commits
2 commits
78878d8b7e
...
348d054b7b
Author | SHA1 | Date | |
---|---|---|---|
348d054b7b | |||
613efc3111 |
5 changed files with 372 additions and 141 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -15,4 +15,5 @@ target/
|
|||
.env.*
|
||||
|
||||
*/flamegraph.svg
|
||||
*/perf.data*
|
||||
*/perf.data*
|
||||
scraper-rs/debug/
|
8
.vscode/launch.json
vendored
8
.vscode/launch.json
vendored
|
@ -7,13 +7,13 @@
|
|||
{
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "warcificator",
|
||||
"cwd": "warcificator/",
|
||||
"name": "scraper-rs",
|
||||
"cwd": "scraper-rs/",
|
||||
"cargo": {
|
||||
// https://github.com/vadimcn/codelldb/issues/884
|
||||
"args": ["build", "--manifest-path=warcificator/Cargo.toml"]
|
||||
"args": ["build", "--manifest-path=scraper-rs/Cargo.toml"]
|
||||
},
|
||||
"args": ["../data/carrefour"],
|
||||
"args": ["../data/Carrefour.txt"],
|
||||
"env": {}
|
||||
},
|
||||
{
|
||||
|
|
366
warcificator/Cargo.lock → scraper-rs/Cargo.lock
generated
366
warcificator/Cargo.lock → scraper-rs/Cargo.lock
generated
|
@ -17,6 +17,17 @@ version = "1.0.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "again"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05802a5ad4d172eaf796f7047b42d0af9db513585d16d4169660a21613d34b93"
|
||||
dependencies = [
|
||||
"log",
|
||||
"rand 0.7.3",
|
||||
"wasm-timer",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.7"
|
||||
|
@ -198,16 +209,6 @@ dependencies = [
|
|||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "82a9b73a36529d9c47029b9fb3a6f0ea3cc916a261195352ba19e770fc1748b2"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.18"
|
||||
|
@ -218,13 +219,10 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
name = "either"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
]
|
||||
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
|
@ -299,6 +297,21 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.30"
|
||||
|
@ -306,6 +319,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -314,6 +328,34 @@ version = "0.3.30"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.30"
|
||||
|
@ -332,10 +374,27 @@ version = "0.3.30"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -346,7 +405,7 @@ checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -491,6 +550,15 @@ dependencies = [
|
|||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.9.0"
|
||||
|
@ -578,10 +646,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"wasi",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nanoid"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3ffa00dec017b5b1a8b7cf5e2c008bfda1aa7e0697ac1508b491fdf2622fb4d8"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
|
@ -629,6 +706,17 @@ version = "2.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
|
||||
dependencies = [
|
||||
"instant",
|
||||
"lock_api",
|
||||
"parking_lot_core 0.8.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
|
@ -636,7 +724,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
"parking_lot_core 0.9.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"instant",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -647,7 +749,7 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"redox_syscall 0.4.1",
|
||||
"smallvec",
|
||||
"windows-targets",
|
||||
]
|
||||
|
@ -677,29 +779,109 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.71"
|
||||
version = "1.0.76"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8"
|
||||
checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.33"
|
||||
version = "1.0.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
|
||||
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"libc",
|
||||
"rand_chacha 0.2.2",
|
||||
"rand_core 0.5.1",
|
||||
"rand_hc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.4.1"
|
||||
|
@ -741,6 +923,7 @@ dependencies = [
|
|||
"system-configuration",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-socks",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
|
@ -758,7 +941,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"getrandom",
|
||||
"getrandom 0.2.11",
|
||||
"libc",
|
||||
"spin",
|
||||
"untrusted",
|
||||
|
@ -828,6 +1011,26 @@ version = "1.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper-rs"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"again",
|
||||
"async-channel",
|
||||
"nanoid",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"simple-error",
|
||||
"thiserror",
|
||||
"tl",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sct"
|
||||
version = "0.7.1"
|
||||
|
@ -899,6 +1102,12 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "simple-error"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.9"
|
||||
|
@ -932,9 +1141,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
|||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.43"
|
||||
version = "2.0.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53"
|
||||
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
@ -964,18 +1173,18 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.55"
|
||||
version = "1.0.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e3de26b0965292219b4287ff031fcba86837900fe9cd2b34ea8ad893c0953d2"
|
||||
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.55"
|
||||
version = "1.0.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "268026685b2be38d7103e9e507c938a1fcb3d7e6eb15e87870b617bf37b6d581"
|
||||
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
@ -992,35 +1201,6 @@ dependencies = [
|
|||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f"
|
||||
dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1052,7 +1232,7 @@ dependencies = [
|
|||
"libc",
|
||||
"mio",
|
||||
"num_cpus",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"socket2",
|
||||
|
@ -1081,6 +1261,18 @@ dependencies = [
|
|||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-socks"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51165dfa029d2a65969413a6cc96f354b86b464498702f174a4efa13608fd8c0"
|
||||
dependencies = [
|
||||
"either",
|
||||
"futures-util",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.10"
|
||||
|
@ -1107,24 +1299,11 @@ version = "0.1.40"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
||||
dependencies = [
|
||||
"log",
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-appender"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"thiserror",
|
||||
"time",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.27"
|
||||
|
@ -1243,20 +1422,10 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "warcificator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-channel",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tl",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
|
@ -1330,6 +1499,21 @@ version = "0.2.89"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-timer"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"js-sys",
|
||||
"parking_lot 0.11.2",
|
||||
"pin-utils",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.66"
|
|
@ -1,24 +1,29 @@
|
|||
[package]
|
||||
name = "warcificator"
|
||||
name = "scraper-rs"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
again = "0.1.2"
|
||||
async-channel = "2.1.1"
|
||||
nanoid = "0.4.0"
|
||||
rand = "0.8.5"
|
||||
# lol_html = "1.2.0"
|
||||
reqwest = { version = "0.11.23", default-features = false, features = [
|
||||
"rustls-tls",
|
||||
"gzip",
|
||||
"brotli",
|
||||
"socks",
|
||||
] }
|
||||
rusqlite = "0.30.0"
|
||||
# scraper = "0.18.1"
|
||||
serde = { version = "1.0.193", features = ["derive"] }
|
||||
serde_json = "1.0.109"
|
||||
simple-error = "0.3.0"
|
||||
thiserror = "1.0.56"
|
||||
tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1", features = ["simd"] }
|
||||
tokio = { version = "1.35.1", features = ["full"] }
|
||||
tracing = { version = "0.1", features = ["log"] }
|
||||
tracing-appender = "0.2.3"
|
||||
tracing-subscriber = "0.3.18"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = "0.3"
|
|
@ -1,11 +1,18 @@
|
|||
use again::RetryPolicy;
|
||||
use async_channel::{Receiver, Sender};
|
||||
use nanoid::nanoid;
|
||||
use rand::seq::SliceRandom;
|
||||
use reqwest::Url;
|
||||
use rusqlite::Connection;
|
||||
use simple_error::{bail, SimpleError};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
env::{self, args},
|
||||
fs,
|
||||
time::{SystemTime, UNIX_EPOCH},
|
||||
path::PathBuf,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
use thiserror::Error;
|
||||
use tl::VDom;
|
||||
use tokio::io::{stderr, AsyncWriteExt};
|
||||
|
||||
|
@ -95,14 +102,16 @@ struct PrecioPoint {
|
|||
// }
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let mut args = args().skip(1);
|
||||
let links_list_path = args.next().unwrap();
|
||||
let links_list_path = args.next().expect("Falta arg para path de lista de urls");
|
||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||
let links = links_str
|
||||
.split("\n")
|
||||
.split('\n')
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| s.len() > 0)
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_owned())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
|
@ -112,8 +121,8 @@ async fn main() {
|
|||
|
||||
let mut handles = Vec::new();
|
||||
for _ in 1..env::var("N_COROUTINES")
|
||||
.map_or(Ok(32), |s| s.parse::<usize>())
|
||||
.unwrap()
|
||||
.map_or(Ok(128), |s| s.parse::<usize>())
|
||||
.expect("N_COROUTINES no es un número")
|
||||
{
|
||||
let rx = receiver.clone();
|
||||
let tx = res_sender.clone();
|
||||
|
@ -134,6 +143,7 @@ async fn main() {
|
|||
db_writer_handle
|
||||
};
|
||||
handle.await.unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
||||
|
@ -145,46 +155,68 @@ async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
|||
tx.send(ex).await.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
stderr()
|
||||
.write_all(format!("Failed to fetch {}: {:?}\n", url.as_str(), err).as_bytes())
|
||||
.await
|
||||
.unwrap();
|
||||
tracing::error!(error=%err, url=url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Error)]
|
||||
enum FetchError {
|
||||
HttpError(reqwest::Error),
|
||||
ParseError(&'static str),
|
||||
#[error("reqwest error")]
|
||||
Http(#[from] reqwest::Error),
|
||||
#[error("http status: {0}")]
|
||||
HttpStatus(reqwest::StatusCode),
|
||||
#[error("parse error")]
|
||||
Parse(#[from] SimpleError),
|
||||
#[error("tl error")]
|
||||
Tl(#[from] tl::ParseError),
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(client))]
|
||||
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
|
||||
let request = client.get(url.as_str()).build().unwrap();
|
||||
let response = client
|
||||
.execute(request)
|
||||
.await
|
||||
.map_err(|e| FetchError::HttpError(e))?;
|
||||
let body = response
|
||||
.text()
|
||||
.await
|
||||
.map_err(|e| FetchError::HttpError(e))?;
|
||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||
.with_max_retries(10)
|
||||
.with_jitter(true);
|
||||
|
||||
let dom = tl::parse(&body, tl::ParserOptions::default()).unwrap();
|
||||
// let parser = dom.parser();
|
||||
let response = policy
|
||||
.retry(|| {
|
||||
let request = client.get(url.as_str()).build().unwrap();
|
||||
client.execute(request)
|
||||
})
|
||||
.await
|
||||
.map_err(FetchError::Http)?;
|
||||
if !response.status().is_success() {
|
||||
return Err(FetchError::HttpStatus(response.status()));
|
||||
}
|
||||
let body = response.text().await.map_err(FetchError::Http)?;
|
||||
|
||||
let point = parse_carrefour(url, &dom)?;
|
||||
let maybe_point = {
|
||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
||||
parse_carrefour(url, &dom)
|
||||
};
|
||||
|
||||
let point = match maybe_point {
|
||||
Ok(p) => Ok(p),
|
||||
Err(err) => {
|
||||
let debug_path = PathBuf::from("debug/");
|
||||
tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||
let file_path = debug_path.join(format!("{}.html", nanoid!()));
|
||||
tokio::fs::write(&file_path, &body).await.unwrap();
|
||||
tracing::debug!(error=%err, "Failed to parse, saved body at {}",file_path.display());
|
||||
Err(err)
|
||||
}
|
||||
}?;
|
||||
|
||||
Ok(point)
|
||||
}
|
||||
|
||||
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchError> {
|
||||
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, SimpleError> {
|
||||
let precio_centavos = {
|
||||
get_meta_content(dom, "product:price:amount")?
|
||||
.map(|s| {
|
||||
s.parse::<f64>()
|
||||
.map_err(|_| FetchError::ParseError("Failed to parse number"))
|
||||
.map_err(|_| SimpleError::new("Failed to parse number"))
|
||||
})
|
||||
.transpose()
|
||||
.map(|f| f.map(|f| (f * 100.0) as u64))
|
||||
|
@ -195,7 +227,7 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
|
|||
Some(s) => match s.as_ref() {
|
||||
"oos" => Some(false),
|
||||
"instock" => Some(true),
|
||||
_ => return Err(FetchError::ParseError("Not a valid product:availability")),
|
||||
_ => return Err(SimpleError::new("Not a valid product:availability")),
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
|
@ -204,7 +236,10 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
|
|||
let json = &parse_script_json(dom, "__STATE__")?;
|
||||
let state = json
|
||||
.as_object()
|
||||
.ok_or(FetchError::ParseError("Seed state not an object"))?;
|
||||
.ok_or(SimpleError::new("Seed state not an object"))?;
|
||||
if state.is_empty() {
|
||||
bail!("Seed state is an empty object")
|
||||
}
|
||||
let (_, product_json) = state
|
||||
.into_iter()
|
||||
.find(|(key, val)| {
|
||||
|
@ -214,11 +249,11 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
|
|||
.and_then(|val| val.get("__typename"))
|
||||
.map_or(false, |typename| typename == "Product")
|
||||
})
|
||||
.ok_or(FetchError::ParseError("No product in seed state"))?;
|
||||
.ok_or(SimpleError::new("No product in seed state"))?;
|
||||
let cache_id = product_json
|
||||
.get("cacheId")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or(FetchError::ParseError("No cacheId in seed state"))?;
|
||||
.ok_or(SimpleError::new("No cacheId in seed state"))?;
|
||||
let (_, product_sku_json) = state
|
||||
.iter()
|
||||
.find(|(key, val)| {
|
||||
|
@ -228,11 +263,11 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
|
|||
.map_or(false, |typename| typename == "SKU")
|
||||
})
|
||||
})
|
||||
.ok_or(FetchError::ParseError("No Product:cacheId* found"))?;
|
||||
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
|
||||
product_sku_json
|
||||
.get("ean")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or(FetchError::ParseError("No product SKU in seed state"))?
|
||||
.ok_or(SimpleError::new("No product SKU in seed state"))?
|
||||
.to_string()
|
||||
};
|
||||
|
||||
|
@ -248,7 +283,10 @@ fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, FetchErro
|
|||
})
|
||||
}
|
||||
|
||||
fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a, str>>, FetchError> {
|
||||
fn get_meta_content<'a>(
|
||||
dom: &'a VDom<'a>,
|
||||
prop: &str,
|
||||
) -> Result<Option<Cow<'a, str>>, SimpleError> {
|
||||
let tag = &dom
|
||||
.query_selector(&format!("meta[property=\"{}\"]", prop))
|
||||
.and_then(|mut iter| iter.next())
|
||||
|
@ -259,14 +297,14 @@ fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Result<Option<Cow<'a,
|
|||
tag.attributes()
|
||||
.get("content")
|
||||
.flatten()
|
||||
.ok_or(FetchError::ParseError("Failed to get content attr"))?
|
||||
.ok_or(SimpleError::new("Failed to get content attr"))?
|
||||
.as_utf8_str(),
|
||||
)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, FetchError> {
|
||||
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, SimpleError> {
|
||||
let parser = dom.parser();
|
||||
let inner_html = &dom
|
||||
.query_selector(&format!(
|
||||
|
@ -282,11 +320,11 @@ fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, Fet
|
|||
.iter()
|
||||
.find(|n| n.as_tag().is_some())
|
||||
})
|
||||
.ok_or(FetchError::ParseError("Failed to get script tag"))?
|
||||
.ok_or(SimpleError::new("Failed to get script tag"))?
|
||||
.inner_html(parser);
|
||||
Ok(inner_html
|
||||
inner_html
|
||||
.parse()
|
||||
.map_err(|_| FetchError::ParseError("Couldn't parse JSON in script"))?)
|
||||
.map_err(|_| SimpleError::new("Couldn't parse JSON in script"))
|
||||
}
|
||||
|
||||
fn now_sec() -> u64 {
|
||||
|
@ -300,7 +338,10 @@ fn now_sec() -> u64 {
|
|||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||
// let conn = Connection::open("../scraper/sqlite.db").unwrap();
|
||||
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
|
||||
let mut n = 0;
|
||||
while let Ok(res) = rx.recv().await {
|
||||
println!("{:?}", res)
|
||||
n += 1;
|
||||
println!("{}", n);
|
||||
// println!("{:?}", res)
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue