mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00
Compare commits
9 commits
348d054b7b
...
972d5ade18
Author | SHA1 | Date | |
---|---|---|---|
972d5ade18 | |||
1348bee6c7 | |||
8e8fe8ddaf | |||
37ceb15e74 | |||
f2401aa965 | |||
3a31586193 | |||
8f6f62a261 | |||
b696551949 | |||
27aee01c1a |
11 changed files with 702 additions and 228 deletions
|
@ -1,3 +1,3 @@
|
||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:6299b470d9debc9a173b40c2ba91208eb43a6c8cde02a4819e7bcd76368e4363
|
oid sha256:f231884c2b9fd0b633746892a00824379b4d8aa110e6348309197b83b0d1c555
|
||||||
size 922185
|
size 926218
|
||||||
|
|
214
scraper-rs/Cargo.lock
generated
214
scraper-rs/Cargo.lock
generated
|
@ -61,6 +61,60 @@ version = "0.2.16"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstream"
|
||||||
|
version = "0.6.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"anstyle-parse",
|
||||||
|
"anstyle-query",
|
||||||
|
"anstyle-wincon",
|
||||||
|
"colorchoice",
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-parse"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
|
||||||
|
dependencies = [
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-query"
|
||||||
|
version = "1.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-wincon"
|
||||||
|
version = "3.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"windows-sys 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anyhow"
|
||||||
|
version = "1.0.79"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-channel"
|
name = "async-channel"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
|
@ -175,6 +229,52 @@ version = "1.0.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "4.4.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c12ed66a79a555082f595f7eb980d08669de95009dd4b3d61168c573ebe38fc9"
|
||||||
|
dependencies = [
|
||||||
|
"clap_builder",
|
||||||
|
"clap_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_builder"
|
||||||
|
version = "4.4.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0f4645eab3431e5a8403a96bea02506a8b35d28cd0f0330977dd5d22f9c84f43"
|
||||||
|
dependencies = [
|
||||||
|
"anstream",
|
||||||
|
"anstyle",
|
||||||
|
"clap_lex",
|
||||||
|
"strsim",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "4.4.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_lex"
|
||||||
|
version = "0.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorchoice"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "concurrent-queue"
|
name = "concurrent-queue"
|
||||||
version = "2.4.0"
|
version = "2.4.0"
|
||||||
|
@ -452,6 +552,12 @@ dependencies = [
|
||||||
"hashbrown",
|
"hashbrown",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hermit-abi"
|
name = "hermit-abi"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
|
@ -647,7 +753,7 @@ checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||||
"windows-sys",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -751,7 +857,7 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"redox_syscall 0.4.1",
|
"redox_syscall 0.4.1",
|
||||||
"smallvec",
|
"smallvec",
|
||||||
"windows-targets",
|
"windows-targets 0.48.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -945,7 +1051,7 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"spin",
|
"spin",
|
||||||
"untrusted",
|
"untrusted",
|
||||||
"windows-sys",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1016,7 +1122,9 @@ name = "scraper-rs"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"again",
|
"again",
|
||||||
|
"anyhow",
|
||||||
"async-channel",
|
"async-channel",
|
||||||
|
"clap",
|
||||||
"nanoid",
|
"nanoid",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
@ -1130,7 +1238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
|
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"windows-sys",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1139,6 +1247,12 @@ version = "0.9.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "2.0.48"
|
version = "2.0.48"
|
||||||
|
@ -1237,7 +1351,7 @@ dependencies = [
|
||||||
"signal-hook-registry",
|
"signal-hook-registry",
|
||||||
"socket2",
|
"socket2",
|
||||||
"tokio-macros",
|
"tokio-macros",
|
||||||
"windows-sys",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1394,6 +1508,12 @@ dependencies = [
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8parse"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "valuable"
|
name = "valuable"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
@ -1558,7 +1678,16 @@ version = "0.48.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
|
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"windows-targets",
|
"windows-targets 0.48.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets 0.52.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1567,13 +1696,28 @@ version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
|
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"windows_aarch64_gnullvm",
|
"windows_aarch64_gnullvm 0.48.5",
|
||||||
"windows_aarch64_msvc",
|
"windows_aarch64_msvc 0.48.5",
|
||||||
"windows_i686_gnu",
|
"windows_i686_gnu 0.48.5",
|
||||||
"windows_i686_msvc",
|
"windows_i686_msvc 0.48.5",
|
||||||
"windows_x86_64_gnu",
|
"windows_x86_64_gnu 0.48.5",
|
||||||
"windows_x86_64_gnullvm",
|
"windows_x86_64_gnullvm 0.48.5",
|
||||||
"windows_x86_64_msvc",
|
"windows_x86_64_msvc 0.48.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-targets"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm 0.52.0",
|
||||||
|
"windows_aarch64_msvc 0.52.0",
|
||||||
|
"windows_i686_gnu 0.52.0",
|
||||||
|
"windows_i686_msvc 0.52.0",
|
||||||
|
"windows_x86_64_gnu 0.52.0",
|
||||||
|
"windows_x86_64_gnullvm 0.52.0",
|
||||||
|
"windows_x86_64_msvc 0.52.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1582,42 +1726,84 @@ version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
|
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_aarch64_msvc"
|
name = "windows_aarch64_msvc"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
|
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_i686_gnu"
|
name = "windows_i686_gnu"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
|
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_i686_msvc"
|
name = "windows_i686_msvc"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
|
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_x86_64_gnu"
|
name = "windows_x86_64_gnu"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
|
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_x86_64_gnullvm"
|
name = "windows_x86_64_gnullvm"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
|
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows_x86_64_msvc"
|
name = "windows_x86_64_msvc"
|
||||||
version = "0.48.5"
|
version = "0.48.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
|
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.52.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winreg"
|
name = "winreg"
|
||||||
version = "0.50.0"
|
version = "0.50.0"
|
||||||
|
@ -1625,7 +1811,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
|
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"windows-sys",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -7,7 +7,9 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
again = "0.1.2"
|
again = "0.1.2"
|
||||||
|
anyhow = "1.0.79"
|
||||||
async-channel = "2.1.1"
|
async-channel = "2.1.1"
|
||||||
|
clap = { version = "4.4.15", features = ["derive"] }
|
||||||
nanoid = "0.4.0"
|
nanoid = "0.4.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
# lol_html = "1.2.0"
|
# lol_html = "1.2.0"
|
||||||
|
|
|
@ -1,112 +1,44 @@
|
||||||
use again::RetryPolicy;
|
use again::RetryPolicy;
|
||||||
use async_channel::{Receiver, Sender};
|
use async_channel::{Receiver, Sender};
|
||||||
|
use clap::Parser;
|
||||||
use nanoid::nanoid;
|
use nanoid::nanoid;
|
||||||
use rand::seq::SliceRandom;
|
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
use simple_error::{bail, SimpleError};
|
use simple_error::{bail, SimpleError};
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
env::{self},
|
||||||
env::{self, args},
|
|
||||||
fs,
|
fs,
|
||||||
path::PathBuf,
|
path::PathBuf,
|
||||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
time::Duration,
|
||||||
};
|
};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tl::VDom;
|
use tl::VDom;
|
||||||
use tokio::io::{stderr, AsyncWriteExt};
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Parser)] // requires `derive` feature
|
||||||
struct PrecioPoint {
|
enum Args {
|
||||||
ean: String,
|
FetchList(FetchListArgs),
|
||||||
// unix
|
ParseFile(ParseFileArgs),
|
||||||
fetched_at: u64,
|
}
|
||||||
precio_centavos: Option<u64>,
|
#[derive(clap::Args)]
|
||||||
in_stock: Option<bool>,
|
struct FetchListArgs {
|
||||||
url: String,
|
list_path: String,
|
||||||
parser_version: u16,
|
}
|
||||||
name: Option<String>,
|
#[derive(clap::Args)]
|
||||||
image_url: Option<String>,
|
struct ParseFileArgs {
|
||||||
|
file_path: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
// fn main() {
|
|
||||||
// let arg = args().skip(1).next().unwrap();
|
|
||||||
|
|
||||||
// let file_iter = fs::read_dir(arg)
|
|
||||||
// .unwrap()
|
|
||||||
// .filter(|pr| {
|
|
||||||
// if let Ok(p) = pr {
|
|
||||||
// !p.file_name().to_str().unwrap().ends_with(".link")
|
|
||||||
// } else {
|
|
||||||
// false
|
|
||||||
// }
|
|
||||||
// })
|
|
||||||
// .take(1000)
|
|
||||||
// .map(|f| fs::read(f.unwrap().path()).unwrap());
|
|
||||||
|
|
||||||
// let mut i = 0;
|
|
||||||
// for item in file_iter {
|
|
||||||
// i = i + 1;
|
|
||||||
// {
|
|
||||||
// // let mut text: Option<String> = None;
|
|
||||||
// // let mut price_str: Option<String> = None;
|
|
||||||
// // let mut rewriter = HtmlRewriter::new(
|
|
||||||
// // Settings {
|
|
||||||
// // element_content_handlers: vec![
|
|
||||||
// // // Rewrite insecure hyperlinks
|
|
||||||
// // element!("a[href]", |el| {
|
|
||||||
// // let href = el.get_attribute("href").unwrap().replace("http:", "https:");
|
|
||||||
|
|
||||||
// // el.set_attribute("href", &href).unwrap();
|
|
||||||
|
|
||||||
// // Ok(())
|
|
||||||
// // }),
|
|
||||||
// // (
|
|
||||||
// // Cow::Owned("a".parse().unwrap()),
|
|
||||||
// // ElementContentHandlers::default().text(extract_first_text(&mut text)),
|
|
||||||
// // ),
|
|
||||||
// // element!(
|
|
||||||
// // "meta[property=\"product:price:amount\"]",
|
|
||||||
// // extract_first_attr(&mut price_str, "content")
|
|
||||||
// // ),
|
|
||||||
// // ],
|
|
||||||
// // memory_settings: lol_html::MemorySettings {
|
|
||||||
// // preallocated_parsing_buffer_size: 1024 * 16,
|
|
||||||
// // max_allowed_memory_usage: std::usize::MAX,
|
|
||||||
// // },
|
|
||||||
// // ..Settings::default()
|
|
||||||
// // },
|
|
||||||
// // |_: &[u8]| {},
|
|
||||||
// // );
|
|
||||||
|
|
||||||
// // rewriter.write(&item).unwrap();
|
|
||||||
// // rewriter.end().unwrap();
|
|
||||||
// // println!("{:#?}", price_str);
|
|
||||||
|
|
||||||
// // let html = scraper::Html::parse_document(&String::from_utf8(item).unwrap());
|
|
||||||
|
|
||||||
// let html = String::from_utf8(item).unwrap();
|
|
||||||
// let dom = tl::parse(&html, tl::ParserOptions::default()).unwrap();
|
|
||||||
|
|
||||||
// match parse_carrefour("".into(), &dom) {
|
|
||||||
// Ok(point) => {
|
|
||||||
// // println!("{:?}", point);
|
|
||||||
// }
|
|
||||||
// Err(err) => {
|
|
||||||
// // println!("Error {:#?}: {}", err, html);
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// println!("n={}", i);
|
|
||||||
// }
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
tracing_subscriber::fmt::init();
|
tracing_subscriber::fmt::init();
|
||||||
|
|
||||||
let mut args = args().skip(1);
|
match Args::parse() {
|
||||||
let links_list_path = args.next().expect("Falta arg para path de lista de urls");
|
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||||
|
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||||
let links = links_str
|
let links = links_str
|
||||||
.split('\n')
|
.split('\n')
|
||||||
|
@ -146,8 +78,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn build_client() -> reqwest::Client {
|
||||||
|
reqwest::ClientBuilder::default().build().unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
|
||||||
let client = reqwest::ClientBuilder::default().build().unwrap();
|
let client = build_client();
|
||||||
while let Ok(url) = rx.recv().await {
|
while let Ok(url) = rx.recv().await {
|
||||||
let res = fetch_and_parse(&client, url.clone()).await;
|
let res = fetch_and_parse(&client, url.clone()).await;
|
||||||
match res {
|
match res {
|
||||||
|
@ -174,7 +110,10 @@ enum FetchError {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tracing::instrument(skip(client))]
|
#[tracing::instrument(skip(client))]
|
||||||
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
|
async fn fetch_and_parse(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
url: String,
|
||||||
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
let policy = RetryPolicy::exponential(Duration::from_millis(300))
|
||||||
.with_max_retries(10)
|
.with_max_retries(10)
|
||||||
.with_jitter(true);
|
.with_jitter(true);
|
||||||
|
@ -187,14 +126,11 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
|
||||||
.await
|
.await
|
||||||
.map_err(FetchError::Http)?;
|
.map_err(FetchError::Http)?;
|
||||||
if !response.status().is_success() {
|
if !response.status().is_success() {
|
||||||
return Err(FetchError::HttpStatus(response.status()));
|
bail!(FetchError::HttpStatus(response.status()));
|
||||||
}
|
}
|
||||||
let body = response.text().await.map_err(FetchError::Http)?;
|
let body = response.text().await.map_err(FetchError::Http)?;
|
||||||
|
|
||||||
let maybe_point = {
|
let maybe_point = { scrap_url(client, url, &body).await };
|
||||||
let dom = tl::parse(&body, tl::ParserOptions::default()).map_err(FetchError::Tl)?;
|
|
||||||
parse_carrefour(url, &dom)
|
|
||||||
};
|
|
||||||
|
|
||||||
let point = match maybe_point {
|
let point = match maybe_point {
|
||||||
Ok(p) => Ok(p),
|
Ok(p) => Ok(p),
|
||||||
|
@ -211,128 +147,48 @@ async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<Precio
|
||||||
Ok(point)
|
Ok(point)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_carrefour(url: String, dom: &tl::VDom) -> Result<PrecioPoint, SimpleError> {
|
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||||
let precio_centavos = {
|
let file = tokio::fs::read_to_string(file_path).await?;
|
||||||
get_meta_content(dom, "product:price:amount")?
|
|
||||||
.map(|s| {
|
|
||||||
s.parse::<f64>()
|
|
||||||
.map_err(|_| SimpleError::new("Failed to parse number"))
|
|
||||||
})
|
|
||||||
.transpose()
|
|
||||||
.map(|f| f.map(|f| (f * 100.0) as u64))
|
|
||||||
}?;
|
|
||||||
|
|
||||||
let in_stock_meta = get_meta_content(dom, "product:availability")?.map(|s| s.into_owned());
|
let client = build_client();
|
||||||
let in_stock = match in_stock_meta {
|
|
||||||
Some(s) => match s.as_ref() {
|
|
||||||
"oos" => Some(false),
|
|
||||||
"instock" => Some(true),
|
|
||||||
_ => return Err(SimpleError::new("Not a valid product:availability")),
|
|
||||||
},
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let ean = {
|
let url = {
|
||||||
let json = &parse_script_json(dom, "__STATE__")?;
|
let dom = tl::parse(&file, tl::ParserOptions::default())?;
|
||||||
let state = json
|
dom.query_selector("link[rel=\"canonical\"]")
|
||||||
.as_object()
|
.unwrap()
|
||||||
.ok_or(SimpleError::new("Seed state not an object"))?;
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
if state.is_empty() {
|
.filter_map(|n| n.as_tag())
|
||||||
bail!("Seed state is an empty object")
|
.next()
|
||||||
}
|
.and_then(|t| t.attributes().get("href").flatten())
|
||||||
let (_, product_json) = state
|
.expect("No meta canonical")
|
||||||
.into_iter()
|
.as_utf8_str()
|
||||||
.find(|(key, val)| {
|
|
||||||
key.starts_with("Product:")
|
|
||||||
&& val
|
|
||||||
.as_object()
|
|
||||||
.and_then(|val| val.get("__typename"))
|
|
||||||
.map_or(false, |typename| typename == "Product")
|
|
||||||
})
|
|
||||||
.ok_or(SimpleError::new("No product in seed state"))?;
|
|
||||||
let cache_id = product_json
|
|
||||||
.get("cacheId")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.ok_or(SimpleError::new("No cacheId in seed state"))?;
|
|
||||||
let (_, product_sku_json) = state
|
|
||||||
.iter()
|
|
||||||
.find(|(key, val)| {
|
|
||||||
key.starts_with(&format!("Product:{}", cache_id))
|
|
||||||
&& val.as_object().map_or(false, |obj| {
|
|
||||||
obj.get("__typename")
|
|
||||||
.map_or(false, |typename| typename == "SKU")
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
|
|
||||||
product_sku_json
|
|
||||||
.get("ean")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.ok_or(SimpleError::new("No product SKU in seed state"))?
|
|
||||||
.to_string()
|
.to_string()
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(PrecioPoint {
|
println!("URL: {}", &url);
|
||||||
ean,
|
println!("{:?}", scrap_url(&client, url, &file).await);
|
||||||
fetched_at: now_sec(),
|
Ok(())
|
||||||
in_stock,
|
|
||||||
name: None,
|
|
||||||
image_url: None,
|
|
||||||
parser_version: 5,
|
|
||||||
precio_centavos,
|
|
||||||
url,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_meta_content<'a>(
|
async fn scrap_url(
|
||||||
dom: &'a VDom<'a>,
|
client: &reqwest::Client,
|
||||||
prop: &str,
|
url: String,
|
||||||
) -> Result<Option<Cow<'a, str>>, SimpleError> {
|
body: &str,
|
||||||
let tag = &dom
|
) -> anyhow::Result<PrecioPoint> {
|
||||||
.query_selector(&format!("meta[property=\"{}\"]", prop))
|
let url_p = Url::parse(&url).unwrap();
|
||||||
.and_then(|mut iter| iter.next())
|
match url_p.host_str().unwrap() {
|
||||||
.and_then(|h| h.get(dom.parser()))
|
"www.carrefour.com.ar" => {
|
||||||
.and_then(|n| n.as_tag());
|
sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||||
match tag {
|
}
|
||||||
Some(tag) => Ok(Some(
|
"diaonline.supermercadosdia.com.ar" => {
|
||||||
tag.attributes()
|
sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||||
.get("content")
|
}
|
||||||
.flatten()
|
"www.cotodigital3.com.ar" => {
|
||||||
.ok_or(SimpleError::new("Failed to get content attr"))?
|
sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||||
.as_utf8_str(),
|
}
|
||||||
)),
|
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
||||||
None => Ok(None),
|
s => bail!("Unknown host {}", s),
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, SimpleError> {
|
|
||||||
let parser = dom.parser();
|
|
||||||
let inner_html = &dom
|
|
||||||
.query_selector(&format!(
|
|
||||||
"template[data-type=\"json\"][data-varname=\"{}\"]",
|
|
||||||
varname
|
|
||||||
))
|
|
||||||
.and_then(|mut iter| iter.next())
|
|
||||||
.and_then(|h| h.get(parser))
|
|
||||||
.and_then(|n| n.as_tag())
|
|
||||||
.and_then(|t| {
|
|
||||||
t.children()
|
|
||||||
.all(parser)
|
|
||||||
.iter()
|
|
||||||
.find(|n| n.as_tag().is_some())
|
|
||||||
})
|
|
||||||
.ok_or(SimpleError::new("Failed to get script tag"))?
|
|
||||||
.inner_html(parser);
|
|
||||||
inner_html
|
|
||||||
.parse()
|
|
||||||
.map_err(|_| SimpleError::new("Couldn't parse JSON in script"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn now_sec() -> u64 {
|
|
||||||
let start = SystemTime::now();
|
|
||||||
let since_the_epoch = start
|
|
||||||
.duration_since(UNIX_EPOCH)
|
|
||||||
.expect("Time went backwards");
|
|
||||||
since_the_epoch.as_secs()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||||
|
@ -345,3 +201,28 @@ async fn db_writer(rx: Receiver<PrecioPoint>) {
|
||||||
// println!("{:?}", res)
|
// println!("{:?}", res)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
|
mod sites;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct PrecioPoint {
|
||||||
|
ean: String,
|
||||||
|
// unix
|
||||||
|
fetched_at: u64,
|
||||||
|
precio_centavos: Option<u64>,
|
||||||
|
in_stock: Option<bool>,
|
||||||
|
url: String,
|
||||||
|
parser_version: u16,
|
||||||
|
name: Option<String>,
|
||||||
|
image_url: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn now_sec() -> u64 {
|
||||||
|
let start = SystemTime::now();
|
||||||
|
let since_the_epoch = start
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.expect("Time went backwards");
|
||||||
|
since_the_epoch.as_secs()
|
||||||
|
}
|
||||||
|
|
68
scraper-rs/src/sites/carrefour.rs
Normal file
68
scraper-rs/src/sites/carrefour.rs
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
use simple_error::bail;
|
||||||
|
use simple_error::SimpleError;
|
||||||
|
|
||||||
|
use crate::sites::common;
|
||||||
|
use crate::sites::vtex;
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
use super::vtex::find_product_ld;
|
||||||
|
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let precio_centavos = common::price_from_meta(dom)?;
|
||||||
|
|
||||||
|
let in_stock = vtex::in_stock_from_meta(dom)?;
|
||||||
|
|
||||||
|
let ean = {
|
||||||
|
let json = &vtex::parse_script_json(dom, "__STATE__")?;
|
||||||
|
let state = json
|
||||||
|
.as_object()
|
||||||
|
.ok_or(SimpleError::new("Seed state not an object"))?;
|
||||||
|
if state.is_empty() {
|
||||||
|
bail!("Seed state is an empty object")
|
||||||
|
}
|
||||||
|
let (_, product_json) = state
|
||||||
|
.iter()
|
||||||
|
.find(|(key, val)| {
|
||||||
|
key.starts_with("Product:") && val.get("__typename").is_some_and(|t| t == "Product")
|
||||||
|
})
|
||||||
|
.ok_or(SimpleError::new("No product in seed state"))?;
|
||||||
|
let cache_id = product_json
|
||||||
|
.get("cacheId")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or(SimpleError::new("No cacheId in seed state"))?;
|
||||||
|
let (_, product_sku_json) = state
|
||||||
|
.iter()
|
||||||
|
.find(|(key, val)| {
|
||||||
|
key.starts_with(&format!("Product:{}", cache_id))
|
||||||
|
&& val.get("__typename").is_some_and(|t| t == "SKU")
|
||||||
|
})
|
||||||
|
.ok_or(SimpleError::new("No Product:cacheId* found"))?;
|
||||||
|
product_sku_json
|
||||||
|
.get("ean")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or(SimpleError::new("No product SKU in seed state"))?
|
||||||
|
.to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
let (name, image_url) = match find_product_ld(dom) {
|
||||||
|
Some(pm) => {
|
||||||
|
let p = pm?;
|
||||||
|
(Some(p.name), Some(p.image))
|
||||||
|
}
|
||||||
|
None => match in_stock {
|
||||||
|
true => bail!("No JSONLD product in in stock product"),
|
||||||
|
false => (None, None),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock: Some(in_stock),
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
19
scraper-rs/src/sites/common.rs
Normal file
19
scraper-rs/src/sites/common.rs
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use tl::VDom;
|
||||||
|
|
||||||
|
pub fn get_meta_content<'a>(dom: &'a VDom<'a>, prop: &str) -> Option<Cow<'a, str>> {
|
||||||
|
dom.query_selector(&format!("meta[property=\"{}\"]", prop))
|
||||||
|
.and_then(|mut iter| iter.next())
|
||||||
|
.and_then(|h| h.get(dom.parser()))
|
||||||
|
.and_then(|n| n.as_tag())
|
||||||
|
.and_then(|tag| tag.attributes().get("content").flatten())
|
||||||
|
.map(|s| s.as_utf8_str())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn price_from_meta(dom: &tl::VDom<'_>) -> Result<Option<u64>, anyhow::Error> {
|
||||||
|
let precio_centavos = get_meta_content(dom, "product:price:amount")
|
||||||
|
.map(|s| s.parse::<f64>().map(|f| (f * 100.0) as u64))
|
||||||
|
.transpose()?;
|
||||||
|
Ok(precio_centavos)
|
||||||
|
}
|
77
scraper-rs/src/sites/coto.rs
Normal file
77
scraper-rs/src/sites/coto.rs
Normal file
|
@ -0,0 +1,77 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let ean = dom
|
||||||
|
.query_selector("div#brandText")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.find(|t| t.inner_text(dom.parser()).as_ref().contains("| EAN: "))
|
||||||
|
.context("No encuentro eanparent")?
|
||||||
|
.query_selector(dom.parser(), "span.span_codigoplu")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.nth(1)
|
||||||
|
.context("no encuentro el ean")?
|
||||||
|
.inner_text(dom.parser())
|
||||||
|
.trim()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let precio_centavos = dom
|
||||||
|
.query_selector(".atg_store_newPrice")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.map(|s| {
|
||||||
|
let s = s.replacen('$', "", 1).replace('.', "").replace(',', ".");
|
||||||
|
let s = s.trim();
|
||||||
|
s.parse::<f64>()
|
||||||
|
})
|
||||||
|
.transpose()
|
||||||
|
.context("Parseando precio")?
|
||||||
|
.map(|f| (f * 100.0) as u64);
|
||||||
|
|
||||||
|
let in_stock = Some(
|
||||||
|
dom.query_selector(".product_not_available")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.is_some(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let name = dom
|
||||||
|
.query_selector("h1.product_page")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
|
.map(|s| s.trim().to_string());
|
||||||
|
|
||||||
|
let image_url = dom
|
||||||
|
.query_selector(".zoomImage1")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.next()
|
||||||
|
.and_then(|t| t.attributes().get("src").flatten())
|
||||||
|
.map(|s| s.as_utf8_str().to_string());
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock,
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
41
scraper-rs/src/sites/dia.rs
Normal file
41
scraper-rs/src/sites/dia.rs
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
use simple_error::bail;
|
||||||
|
|
||||||
|
use crate::sites::common;
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
use super::vtex::find_product_ld;
|
||||||
|
use super::vtex::AvailabilityLd;
|
||||||
|
|
||||||
|
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let ean = common::get_meta_content(dom, "product:retailer_item_id")
|
||||||
|
.context("Parsing EAN")?
|
||||||
|
.to_string();
|
||||||
|
let precio_centavos = common::price_from_meta(dom)?;
|
||||||
|
|
||||||
|
let (name, image_url, in_stock) = match find_product_ld(dom) {
|
||||||
|
Some(pm) => {
|
||||||
|
let p = pm?;
|
||||||
|
(
|
||||||
|
Some(p.name),
|
||||||
|
Some(p.image),
|
||||||
|
Some(
|
||||||
|
p.offers.offers.first().context("No offer")?.availability
|
||||||
|
== AvailabilityLd::InStock,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
None => bail!("No JSON/LD"),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock,
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
92
scraper-rs/src/sites/jumbo.rs
Normal file
92
scraper-rs/src/sites/jumbo.rs
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use reqwest::Url;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use simple_error::bail;
|
||||||
|
|
||||||
|
use crate::sites::common;
|
||||||
|
use crate::PrecioPoint;
|
||||||
|
|
||||||
|
use super::vtex;
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct JumboSearch {
|
||||||
|
items: Vec<JumboSearchItem>,
|
||||||
|
}
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct JumboSearchItem {
|
||||||
|
ean: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_ean_from_search(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
retailer_sku: String,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let s = client
|
||||||
|
.get({
|
||||||
|
let mut url =
|
||||||
|
Url::from_str("https://www.jumbo.com.ar/api/catalog_system/pub/products/search")
|
||||||
|
.unwrap();
|
||||||
|
url.set_query(Some(&format!("fq=skuId:{}", retailer_sku)));
|
||||||
|
url
|
||||||
|
})
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.text()
|
||||||
|
.await?;
|
||||||
|
let ean = {
|
||||||
|
let search: Vec<JumboSearch> = serde_json::from_str(&s)?;
|
||||||
|
let result = search.first().context("No search result")?;
|
||||||
|
let ean = result
|
||||||
|
.items
|
||||||
|
.first()
|
||||||
|
.context("No search result")?
|
||||||
|
.ean
|
||||||
|
.clone();
|
||||||
|
if !result.items.iter().all(|i| i.ean == ean) {
|
||||||
|
bail!("Inesperado: no todos los items tienen el mismo EAN")
|
||||||
|
}
|
||||||
|
ean
|
||||||
|
};
|
||||||
|
Ok(ean)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn scrap(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
url: String,
|
||||||
|
body: &str,
|
||||||
|
) -> Result<PrecioPoint, anyhow::Error> {
|
||||||
|
let (name, image_url, sku, precio_centavos, in_stock) = {
|
||||||
|
let dom = tl::parse(body, tl::ParserOptions::default())?;
|
||||||
|
let precio_centavos = common::price_from_meta(&dom)?;
|
||||||
|
let in_stock = vtex::in_stock_from_meta(&dom)?;
|
||||||
|
|
||||||
|
match vtex::find_product_ld(&dom) {
|
||||||
|
Some(pm) => {
|
||||||
|
let p = pm?;
|
||||||
|
(
|
||||||
|
Some(p.name),
|
||||||
|
Some(p.image),
|
||||||
|
p.sku.context("No retailer SKU in Product LD")?,
|
||||||
|
precio_centavos,
|
||||||
|
in_stock,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
None => bail!("No JSON/LD"),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let ean = get_ean_from_search(client, sku).await?;
|
||||||
|
|
||||||
|
Ok(PrecioPoint {
|
||||||
|
ean,
|
||||||
|
fetched_at: crate::now_sec(),
|
||||||
|
in_stock: Some(in_stock),
|
||||||
|
name,
|
||||||
|
image_url,
|
||||||
|
parser_version: 5,
|
||||||
|
precio_centavos,
|
||||||
|
url,
|
||||||
|
})
|
||||||
|
}
|
6
scraper-rs/src/sites/mod.rs
Normal file
6
scraper-rs/src/sites/mod.rs
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
pub mod carrefour;
|
||||||
|
mod common;
|
||||||
|
pub mod coto;
|
||||||
|
pub mod dia;
|
||||||
|
pub mod jumbo;
|
||||||
|
mod vtex;
|
102
scraper-rs/src/sites/vtex.rs
Normal file
102
scraper-rs/src/sites/vtex.rs
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
use anyhow::{bail, Context};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use simple_error::SimpleError;
|
||||||
|
use tl::VDom;
|
||||||
|
|
||||||
|
use super::common;
|
||||||
|
|
||||||
|
pub fn parse_script_json(dom: &VDom, varname: &str) -> Result<serde_json::Value, anyhow::Error> {
|
||||||
|
let inner_html = &dom
|
||||||
|
.query_selector("template[data-type=\"json\"]")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()).and_then(|n| n.as_tag()))
|
||||||
|
.find(|t| {
|
||||||
|
t.attributes()
|
||||||
|
.get("data-varname")
|
||||||
|
.flatten()
|
||||||
|
.map_or(false, |v| v.as_utf8_str() == varname)
|
||||||
|
})
|
||||||
|
.ok_or(SimpleError::new("Failed to get template tag"))?
|
||||||
|
.query_selector(dom.parser(), "script")
|
||||||
|
.and_then(|mut it| it.next())
|
||||||
|
.and_then(|h| h.get(dom.parser()))
|
||||||
|
.ok_or(SimpleError::new("Failed to get script tag"))?
|
||||||
|
.inner_html(dom.parser());
|
||||||
|
inner_html.parse().context("Couldn't parse JSON in script")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_json_lds<'a>(
|
||||||
|
dom: &'a VDom,
|
||||||
|
) -> impl Iterator<Item = std::result::Result<serde_json::Value, serde_json::Error>> + 'a {
|
||||||
|
dom.query_selector("script[type=\"application/ld+json\"]")
|
||||||
|
.unwrap()
|
||||||
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
|
.filter_map(|n| n.as_tag())
|
||||||
|
.map(|t| serde_json::from_str(&t.inner_html(dom.parser())))
|
||||||
|
}
|
||||||
|
pub fn find_json_ld(dom: &VDom, typ: &str) -> Option<Result<Ld, serde_json::Error>> {
|
||||||
|
get_json_lds(dom)
|
||||||
|
.filter_map(|v| v.ok())
|
||||||
|
.find(|v| v.get("@type").is_some_and(|t| t == typ))
|
||||||
|
.map(serde_json::from_value)
|
||||||
|
}
|
||||||
|
pub fn find_product_ld(dom: &VDom) -> Option<Result<ProductLd, serde_json::Error>> {
|
||||||
|
find_json_ld(dom, "Product").map(|l| {
|
||||||
|
l.map(|l| match l {
|
||||||
|
Ld::Product(p) => p,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
#[serde(tag = "@type")]
|
||||||
|
pub enum Ld {
|
||||||
|
Product(ProductLd),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ProductLd {
|
||||||
|
pub name: String,
|
||||||
|
pub image: String,
|
||||||
|
pub sku: Option<String>,
|
||||||
|
pub offers: OffersLd,
|
||||||
|
}
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
pub struct OffersLd {
|
||||||
|
pub offers: Vec<OfferLd>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct OfferLd {
|
||||||
|
#[serde(rename = "@type")]
|
||||||
|
_type: OfferTypeLd,
|
||||||
|
pub price: f64,
|
||||||
|
pub price_currency: String,
|
||||||
|
pub availability: AvailabilityLd,
|
||||||
|
}
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
pub enum OfferTypeLd {
|
||||||
|
Offer,
|
||||||
|
}
|
||||||
|
#[derive(Deserialize, PartialEq)]
|
||||||
|
pub enum AvailabilityLd {
|
||||||
|
#[serde(rename = "http://schema.org/InStock")]
|
||||||
|
InStock,
|
||||||
|
#[serde(rename = "http://schema.org/OutOfStock")]
|
||||||
|
OutOfStock,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
|
||||||
|
Ok(
|
||||||
|
match common::get_meta_content(dom, "product:availability") {
|
||||||
|
Some(s) => match s.as_ref() {
|
||||||
|
"oos" => false,
|
||||||
|
"instock" => true,
|
||||||
|
_ => bail!("Not a valid product:availability"),
|
||||||
|
},
|
||||||
|
None => bail!("No product:availability in carrefour"),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
Loading…
Reference in a new issue