mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 06:16:18 +00:00
refactor, simplify, get proxy list by env
This commit is contained in:
parent
a7afef5b95
commit
c946e7fe35
7 changed files with 127 additions and 998 deletions
664
scraper-rs/Cargo.lock
generated
664
scraper-rs/Cargo.lock
generated
|
@ -155,12 +155,6 @@ dependencies = [
|
|||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-waker"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.3.0"
|
||||
|
@ -251,12 +245,6 @@ version = "3.16.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
|
@ -341,35 +329,12 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.15.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
|
||||
dependencies = [
|
||||
"encode_unicode",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"unicode-width",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-oid"
|
||||
version = "0.9.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.6"
|
||||
|
@ -445,29 +410,6 @@ dependencies = [
|
|||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.31.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa",
|
||||
"phf 0.11.2",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser-macros"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "der"
|
||||
version = "0.7.9"
|
||||
|
@ -479,17 +421,6 @@ dependencies = [
|
|||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
|
@ -508,27 +439,6 @@ version = "0.15.7"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa-short"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
|
||||
dependencies = [
|
||||
"dtoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ego-tree"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.12.0"
|
||||
|
@ -538,21 +448,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
|
@ -631,21 +526,6 @@ version = "1.0.7"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.1"
|
||||
|
@ -655,16 +535,6 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||
dependencies = [
|
||||
"mac",
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.30"
|
||||
|
@ -765,15 +635,6 @@ dependencies = [
|
|||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
|
@ -784,15 +645,6 @@ dependencies = [
|
|||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.16"
|
||||
|
@ -821,25 +673,6 @@ version = "0.29.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
"fnv",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http",
|
||||
"indexmap",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
|
@ -922,20 +755,6 @@ dependencies = [
|
|||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.1.0"
|
||||
|
@ -976,12 +795,6 @@ version = "1.9.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.3.1"
|
||||
|
@ -991,7 +804,6 @@ dependencies = [
|
|||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"h2",
|
||||
"http",
|
||||
"http-body",
|
||||
"httparse",
|
||||
|
@ -1020,22 +832,6 @@ dependencies = [
|
|||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-util",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.5"
|
||||
|
@ -1099,19 +895,6 @@ dependencies = [
|
|||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.17.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
|
||||
dependencies = [
|
||||
"console",
|
||||
"instant",
|
||||
"number_prefix",
|
||||
"portable-atomic",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.13"
|
||||
|
@ -1211,26 +994,6 @@ version = "0.4.21"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf 0.10.1",
|
||||
"phf_codegen",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
version = "0.10.6"
|
||||
|
@ -1288,29 +1051,6 @@ dependencies = [
|
|||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
|
@ -1388,12 +1128,6 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.0"
|
||||
|
@ -1409,67 +1143,12 @@ version = "1.19.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
|
||||
dependencies = [
|
||||
"bitflags 2.5.0",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.102"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "papergrid"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ad43c07024ef767f9160710b3a6773976194758c7919b17e63b863db0bdf7fb"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"fnv",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
|
@ -1539,86 +1218,6 @@ version = "2.3.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared 0.11.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
|
||||
dependencies = [
|
||||
"phf_shared 0.11.2",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
|
||||
dependencies = [
|
||||
"phf_generator 0.11.2",
|
||||
"phf_shared 0.11.2",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.5"
|
||||
|
@ -1678,48 +1277,12 @@ version = "0.3.30"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
|
||||
[[package]]
|
||||
name = "precomputed-hash"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.85"
|
||||
|
@ -1729,24 +1292,6 @@ dependencies = [
|
|||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proxy-scraper-checker"
|
||||
version = "0.1.3"
|
||||
source = "git+https://github.com/catdevnull/Proxy-Scraper-Checker?rev=429ca83d137abdf5377a1d22ee85d62bfb00437c#429ca83d137abdf5377a1d22ee85d62bfb00437c"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"futures",
|
||||
"humantime",
|
||||
"indicatif",
|
||||
"reqwest",
|
||||
"rlimit",
|
||||
"scraper",
|
||||
"serde_json",
|
||||
"tabled",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.31.0"
|
||||
|
@ -1919,22 +1464,18 @@ dependencies = [
|
|||
"async-compression",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"encoding_rs",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
|
@ -1946,9 +1487,7 @@ dependencies = [
|
|||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"system-configuration",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls",
|
||||
"tokio-socks",
|
||||
"tokio-util",
|
||||
|
@ -1976,15 +1515,6 @@ dependencies = [
|
|||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rlimit"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rsa"
|
||||
version = "0.9.6"
|
||||
|
@ -2091,37 +1621,12 @@ version = "1.0.18"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
|
||||
dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b80b33679ff7a0ea53d37f3b39de77ea0c75b12c5805ac43ec0c33b3051af1b"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"cssparser",
|
||||
"ego-tree",
|
||||
"getopts",
|
||||
"html5ever",
|
||||
"once_cell",
|
||||
"selectors",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scraper-rs"
|
||||
version = "0.1.0"
|
||||
|
@ -2134,10 +1639,8 @@ dependencies = [
|
|||
"cron",
|
||||
"futures",
|
||||
"html-escape",
|
||||
"indicatif",
|
||||
"itertools",
|
||||
"nanoid",
|
||||
"proxy-scraper-checker",
|
||||
"quick-xml",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
|
@ -2153,48 +1656,6 @@ dependencies = [
|
|||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "2.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0"
|
||||
dependencies = [
|
||||
"bitflags 2.5.0",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework-sys"
|
||||
version = "2.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
|
||||
dependencies = [
|
||||
"bitflags 2.5.0",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
"log",
|
||||
"new_debug_unreachable",
|
||||
"phf 0.10.1",
|
||||
"phf_codegen",
|
||||
"precomputed-hash",
|
||||
"servo_arc",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.203"
|
||||
|
@ -2238,15 +1699,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.6"
|
||||
|
@ -2303,12 +1755,6 @@ version = "0.3.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e2accd2c41a0e920d2abd91b2badcfa1da784662f54fbc47e0e3a51f1e2e1cf"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.9"
|
||||
|
@ -2562,38 +2008,6 @@ dependencies = [
|
|||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.3",
|
||||
"phf_shared 0.10.0",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache_codegen"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.5"
|
||||
|
@ -2645,51 +2059,6 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"core-foundation",
|
||||
"system-configuration-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration-sys"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabled"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c998b0c8b921495196a48aabaf1901ff28be0760136e31604f7967b0792050e"
|
||||
dependencies = [
|
||||
"papergrid",
|
||||
"tabled_derive",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabled_derive"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c138f99377e5d653a371cdad263615634cfc8467685dfe8e73e2b8e98f44b17"
|
||||
dependencies = [
|
||||
"heck 0.4.1",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.10.1"
|
||||
|
@ -2702,17 +2071,6 @@ dependencies = [
|
|||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||
dependencies = [
|
||||
"futf",
|
||||
"mac",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.61"
|
||||
|
@ -2793,16 +2151,6 @@ dependencies = [
|
|||
"syn 2.0.66",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.26.0"
|
||||
|
@ -2980,12 +2328,6 @@ version = "1.11.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
|
||||
|
||||
[[package]]
|
||||
name = "unicode_categories"
|
||||
version = "0.1.1"
|
||||
|
@ -3015,12 +2357,6 @@ version = "2.1.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
|
|
|
@ -35,5 +35,3 @@ tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1" }
|
|||
tokio = { version = "1.35.1", features = ["full"] }
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = "0.3"
|
||||
proxy-scraper-checker = { git = "https://github.com/catdevnull/Proxy-Scraper-Checker", rev = "429ca83d137abdf5377a1d22ee85d62bfb00437c" }
|
||||
indicatif = "0.17.8"
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
use super::fetch_list;
|
||||
use super::now_sec;
|
||||
use super::supermercado::Supermercado;
|
||||
use super::AutoArgs;
|
||||
|
@ -58,7 +57,7 @@ impl Auto {
|
|||
// }
|
||||
{
|
||||
let t0 = now_sec();
|
||||
let counters = fetch_list(&self.db, links).await;
|
||||
let counters = self.scraper.fetch_list(&self.db, links).await;
|
||||
self.inform(&format!(
|
||||
"Downloaded {:?}: {:?} (took {})",
|
||||
&supermercado,
|
||||
|
|
|
@ -2,7 +2,7 @@ use again::RetryPolicy;
|
|||
use clap::{Parser, ValueEnum};
|
||||
use cron::Schedule;
|
||||
use db::Db;
|
||||
use futures::{future, stream, StreamExt, TryFutureExt};
|
||||
use futures::{future, TryFutureExt};
|
||||
|
||||
use reqwest::{header::HeaderMap, IntoUrl, StatusCode};
|
||||
use scraper::Scraper;
|
||||
|
@ -58,7 +58,7 @@ struct AutoArgs {
|
|||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
async fn main() -> () {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
match Args::parse() {
|
||||
|
@ -70,11 +70,12 @@ async fn main() -> anyhow::Result<()> {
|
|||
Args::Auto(a) => auto_cli(a).await,
|
||||
Args::Cron(_) => cron_cli().await,
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
|
||||
let scraper = Scraper::new();
|
||||
let res = scraper.fetch_and_parse(url.clone()).await;
|
||||
let scraper = Scraper::from_env().await?;
|
||||
let res = scraper.fetch_and_scrap(url.clone()).await;
|
||||
|
||||
println!("Result: {:#?}", res);
|
||||
res.map(|_| ())
|
||||
|
@ -98,37 +99,13 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
|||
.collect::<Vec<_>>();
|
||||
|
||||
let db = Db::connect().await?;
|
||||
let counters = fetch_list(&db, links).await;
|
||||
let scraper = Scraper::from_env().await?;
|
||||
let counters = scraper.fetch_list(&db, links).await;
|
||||
|
||||
println!("Finished: {:?}", counters);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn fetch_list(db: &Db, links: Vec<String>) -> Counters {
|
||||
let n_coroutines = env::var("N_COROUTINES")
|
||||
.map_or(Ok(24), |s| s.parse::<usize>())
|
||||
.expect("N_COROUTINES no es un número");
|
||||
|
||||
let scraper = Scraper::new();
|
||||
|
||||
stream::iter(links)
|
||||
.map(|url| {
|
||||
let db = db.clone();
|
||||
let scraper = scraper.clone();
|
||||
tokio::spawn(fetch_and_save(scraper, url, db))
|
||||
})
|
||||
.buffer_unordered(n_coroutines)
|
||||
.fold(Counters::default(), move |x, y| {
|
||||
let ret = y.unwrap();
|
||||
future::ready(Counters {
|
||||
success: x.success + ret.success,
|
||||
errored: x.errored + ret.errored,
|
||||
skipped: x.skipped + ret.skipped,
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
mod db;
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
|
@ -138,29 +115,6 @@ struct Counters {
|
|||
skipped: u64,
|
||||
}
|
||||
|
||||
async fn fetch_and_save(scraper: Scraper, url: String, db: Db) -> Counters {
|
||||
let res = scraper.fetch_and_parse(url.clone()).await;
|
||||
let mut counters = Counters::default();
|
||||
match res {
|
||||
Ok(res) => {
|
||||
counters.success += 1;
|
||||
db.insert_precio(res).await.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
match err.downcast_ref::<reqwest::Error>() {
|
||||
Some(e) => match e.status() {
|
||||
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||
_ => counters.errored += 1,
|
||||
},
|
||||
_ => counters.errored += 1,
|
||||
}
|
||||
|
||||
tracing::error!(error=%err, url=url);
|
||||
}
|
||||
}
|
||||
counters
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
enum FetchError {
|
||||
#[error("parse error")]
|
||||
|
@ -222,7 +176,7 @@ pub fn anyhow_retry_if_wasnt_not_found(err: &anyhow::Error) -> bool {
|
|||
async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
||||
let file = tokio::fs::read_to_string(file_path).await?;
|
||||
|
||||
let scraper = Scraper::new();
|
||||
let scraper = Scraper::from_env().await?;
|
||||
|
||||
let url = {
|
||||
let dom = tl::parse(&file, tl::ParserOptions::default())?;
|
||||
|
@ -242,7 +196,7 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> {
|
|||
}
|
||||
|
||||
async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> {
|
||||
let scraper = Scraper::new();
|
||||
let scraper = Scraper::from_env().await?;
|
||||
let urls = scraper.get_urls_for_supermercado(&supermercado).await?;
|
||||
urls.iter().for_each(|s| {
|
||||
println!("{}", s);
|
||||
|
@ -276,7 +230,7 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
|
|||
db,
|
||||
telegram,
|
||||
args,
|
||||
scraper: Scraper::new(),
|
||||
scraper: Scraper::from_env().await?,
|
||||
}
|
||||
};
|
||||
auto.inform("[auto] Empezando scrap").await;
|
||||
|
@ -289,7 +243,7 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
|
|||
let handles: Vec<_> = supermercados
|
||||
.iter()
|
||||
.map(|s| {
|
||||
let x = s.clone();
|
||||
let x = *s;
|
||||
tokio::spawn(
|
||||
auto.clone()
|
||||
.download_supermercado(s.to_owned())
|
||||
|
|
|
@ -1,274 +1,58 @@
|
|||
use std::{collections::HashSet, str::FromStr, sync::Arc, time::Duration};
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::{future::join_all, stream, FutureExt, StreamExt};
|
||||
use indicatif::ProgressBar;
|
||||
use itertools::Itertools;
|
||||
use proxy_scraper_checker::{Proxy, ProxyChecker};
|
||||
use rand::Rng;
|
||||
use reqwest::{IntoUrl, Url};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::{RwLock, Semaphore};
|
||||
|
||||
use crate::build_header_map;
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ProxyClient {
|
||||
proxies: RwLock<Option<Vec<Url>>>,
|
||||
clients: RwLock<[Option<reqwest::Client>; 10]>,
|
||||
// proxies: Vec<Url>,
|
||||
clients: Vec<reqwest::Client>,
|
||||
}
|
||||
|
||||
impl ProxyClient {
|
||||
pub async fn do_request(&self, url: impl IntoUrl + Clone) -> anyhow::Result<reqwest::Response> {
|
||||
loop {
|
||||
let client = {
|
||||
let mut client_ptr = self.clients.write().await;
|
||||
if let Some(client) = (*client_ptr).clone() {
|
||||
client
|
||||
} else {
|
||||
let proxies = self.get_proxies().await?;
|
||||
// let proxy = stream::iter(proxies)
|
||||
// .filter_map(|proxy| async {
|
||||
// println!("trying proxy {}", proxy);
|
||||
// check_proxy(
|
||||
// proxy.clone(),
|
||||
// "https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(),
|
||||
// 3,
|
||||
// )
|
||||
// .map(|r| r.ok())
|
||||
// .await
|
||||
// })
|
||||
// .next()
|
||||
// .await
|
||||
// .unwrap()
|
||||
// .clone();
|
||||
let proxy = loop {
|
||||
let proxy = proxies[rand::thread_rng().gen_range(0..proxies.len())].clone();
|
||||
println!("trying proxy {}", proxy);
|
||||
match check_proxy(
|
||||
proxy,
|
||||
"https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(),
|
||||
10,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(proxy) => break proxy,
|
||||
Err(_) => continue,
|
||||
}
|
||||
};
|
||||
|
||||
println!("chose proxy {}", proxy);
|
||||
let new_client = reqwest::ClientBuilder::default()
|
||||
.timeout(Duration::from_secs(300))
|
||||
.connect_timeout(Duration::from_secs(150))
|
||||
.default_headers(build_header_map())
|
||||
.proxy(reqwest::Proxy::all(proxy)?)
|
||||
.build()
|
||||
.unwrap();
|
||||
let ret = new_client.clone();
|
||||
*client_ptr = Some(new_client);
|
||||
ret
|
||||
}
|
||||
};
|
||||
let req = client.get(url.clone()).build()?;
|
||||
match client.execute(req).await {
|
||||
Ok(res) => return Ok(res),
|
||||
Err(_) => {
|
||||
// possibly IP locked, reset client to get another IP
|
||||
{
|
||||
println!("request failed, resetting client");
|
||||
*(self.clients.write().await) = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_proxies(&self) -> anyhow::Result<Vec<Url>> {
|
||||
let mut proxies_ptr = self.proxies.write().await;
|
||||
if let Some(proxies) = (*proxies_ptr).clone() {
|
||||
Ok(proxies)
|
||||
pub fn from_proxy_list(proxies: &str) -> anyhow::Result<Self> {
|
||||
let proxies = Self::parse_proxy_list(proxies)?;
|
||||
let clients = if proxies.is_empty() {
|
||||
tracing::warn!("No proxies available; using no proxy");
|
||||
vec![Self::client_builder().build()?]
|
||||
} else {
|
||||
// let scraper = proxy_scraper_checker::ProxyScraper::default();
|
||||
proxies
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(Self::build_client_with_proxy)
|
||||
.try_collect()?
|
||||
};
|
||||
Ok(Self { clients })
|
||||
}
|
||||
|
||||
// let archive_urls = scraper.scrape_archive_urls().await?;
|
||||
// let futures: Vec<_> = archive_urls
|
||||
// .into_iter()
|
||||
// .map(|url| {
|
||||
// tokio::task::spawn({
|
||||
// let value = scraper.clone();
|
||||
// async move { value.scrape_proxies(url, true).await }
|
||||
// })
|
||||
// })
|
||||
// .collect();
|
||||
// let results: Vec<_> = join_all(futures).await.into_iter().try_collect()?;
|
||||
// let proxies: Vec<_> = results
|
||||
// .into_iter()
|
||||
// .filter_map(|res| if let Ok(res) = res { Some(res) } else { None })
|
||||
// .flatten()
|
||||
// .filter(|x| {
|
||||
// if let Proxy::Socks5(_) = x {
|
||||
// true
|
||||
// } else {
|
||||
// false
|
||||
// }
|
||||
// })
|
||||
// .collect();
|
||||
fn parse_proxy_list(proxies: &str) -> anyhow::Result<Vec<Url>> {
|
||||
Ok(proxies
|
||||
.split("\n")
|
||||
.filter(|s| !s.trim().is_empty())
|
||||
.map(Url::parse)
|
||||
.try_collect()?)
|
||||
}
|
||||
fn client_builder() -> reqwest::ClientBuilder {
|
||||
reqwest::ClientBuilder::default()
|
||||
.timeout(Duration::from_secs(300))
|
||||
.connect_timeout(Duration::from_secs(150))
|
||||
.default_headers(build_header_map())
|
||||
}
|
||||
fn build_client_with_proxy(proxy: Url) -> reqwest::Result<reqwest::Client> {
|
||||
Self::client_builder()
|
||||
.proxy(reqwest::Proxy::all(proxy)?)
|
||||
.build()
|
||||
}
|
||||
|
||||
let socks5_proxies = get_proxy_list_from_raw_list(
|
||||
"https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt",
|
||||
"socks5",
|
||||
)
|
||||
.await?;
|
||||
let http_proxies = get_proxy_list_from_raw_list(
|
||||
"https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
|
||||
"http",
|
||||
)
|
||||
.await?;
|
||||
let fosy_http_proxies =
|
||||
get_proxy_list_from_raw_list("https://fosy.club/api/free/list?type=http", "http")
|
||||
.await?;
|
||||
let fosy_socks5_proxies = get_proxy_list_from_raw_list(
|
||||
"https://fosy.club/api/free/list?type=socks5",
|
||||
"socks5",
|
||||
)
|
||||
.await?;
|
||||
let geonode_proxies = get_proxy_list_geonode()
|
||||
.await
|
||||
.inspect_err(|e| tracing::error!("getting proxy list ({error})", error = e))?;
|
||||
|
||||
// let proxies: Vec<_> = [
|
||||
// // socks5_proxies,
|
||||
// // http_proxies,
|
||||
// fosy_http_proxies,
|
||||
// fosy_socks5_proxies,
|
||||
// geonode_proxies,
|
||||
// ]
|
||||
// .into_iter()
|
||||
// .flatten()
|
||||
// .collect();
|
||||
|
||||
let checked_proxies: Vec<_> = {
|
||||
let proxiess: HashSet<_> = proxies
|
||||
.into_iter()
|
||||
.filter_map(|p| match p.scheme() {
|
||||
"socks5" => Some(Proxy::Socks5(p.host_str()?.to_string())),
|
||||
"http" => Some(Proxy::Http(p.host_str()?.to_string())),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
let checker = ProxyChecker::new(
|
||||
Arc::new(Semaphore::new(32)),
|
||||
ProgressBar::new(proxiess.len().try_into().unwrap()),
|
||||
);
|
||||
checker
|
||||
.check_proxies(proxiess.into(), "https://milei.nulo.in/".to_string(), 8)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|p| Url::from_str(&p.url()))
|
||||
.try_collect()?
|
||||
};
|
||||
|
||||
let ret = checked_proxies.clone();
|
||||
println!("got {} proxies", ret.len());
|
||||
*proxies_ptr = Some(checked_proxies);
|
||||
Ok(ret)
|
||||
}
|
||||
pub async fn do_request(
|
||||
&self,
|
||||
url: impl IntoUrl + Clone,
|
||||
) -> reqwest::Result<reqwest::Response> {
|
||||
let client = self.clients[rand::thread_rng().gen_range(0..self.clients.len())].clone();
|
||||
let req = client.get(url.clone()).build()?;
|
||||
client.execute(req).await
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn check_proxy(proxy: Url, url: String, timeout: u64) -> anyhow::Result<Url> {
|
||||
let client = reqwest::Client::builder()
|
||||
.proxy(reqwest::Proxy::all(proxy.clone())?)
|
||||
.timeout(Duration::from_secs(timeout))
|
||||
.build()?;
|
||||
|
||||
client
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.context("Request failed")?
|
||||
.error_for_status()
|
||||
.context("Request returned an error status code")?;
|
||||
|
||||
Ok(proxy)
|
||||
}
|
||||
|
||||
// pub async fn find_first_working_proxy(proxies: Vec<String>) -> anyhow::Result<Url> {
|
||||
// let semaphore = Arc::new(Semaphore::new(64));
|
||||
// for proxy in proxies {
|
||||
// let semaphore = semaphore.clone();
|
||||
|
||||
// }
|
||||
|
||||
// let proxy = stream::iter(proxies)
|
||||
// .filter_map(|proxy| async {
|
||||
// println!("trying proxy {}", proxy);
|
||||
// check_proxy(
|
||||
// proxy.clone(),
|
||||
// "https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(),
|
||||
// 3,
|
||||
// )
|
||||
// .map(|r| r.ok())
|
||||
// .await
|
||||
// }).concu
|
||||
// .next()
|
||||
// .await
|
||||
// .unwrap()
|
||||
// .clone();
|
||||
|
||||
// }
|
||||
|
||||
pub async fn get_proxy_list_from_raw_list<U: IntoUrl>(
|
||||
list_url: U,
|
||||
protocol: &str,
|
||||
) -> anyhow::Result<Vec<Url>> {
|
||||
let res = reqwest::get(list_url).await?;
|
||||
let text = res.text().await?;
|
||||
Ok(text
|
||||
.lines()
|
||||
.map(|l| Url::from_str(&format!("{}://{}", protocol, l)))
|
||||
.filter_map(|r| r.ok())
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Ips {
|
||||
data: Vec<Ip>,
|
||||
}
|
||||
#[derive(Deserialize)]
|
||||
struct Ip {
|
||||
ip: String,
|
||||
port: String,
|
||||
protocols: Vec<String>,
|
||||
}
|
||||
pub async fn get_proxy_list_geonode() -> anyhow::Result<Vec<Url>> {
|
||||
let ips = reqwest::get("https://proxylist.geonode.com/api/proxy-list?protocols=socks5%2Chttp&filterUpTime=90&limit=500&page=1&sort_by=lastChecked&sort_type=asc").await?.json::<Ips>().await?;
|
||||
Ok(ips
|
||||
.data
|
||||
.into_iter()
|
||||
.map(|i| Url::from_str(&format!("{}://{}:{}", i.protocols[0], i.ip, i.port)))
|
||||
.filter_map(|r| r.ok())
|
||||
.collect())
|
||||
}
|
||||
pub async fn get_proxy_list_checkerproxy() -> anyhow::Result<Vec<Url>> {
|
||||
let scraper = proxy_scraper_checker::ProxyScraper::default();
|
||||
let archive_urls = scraper.scrape_archive_urls().await?;
|
||||
let futures: Vec<_> = archive_urls
|
||||
.into_iter()
|
||||
.map(|url| {
|
||||
tokio::task::spawn({
|
||||
let value = scraper.clone();
|
||||
async move { value.scrape_proxies(url, true).await }
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
let results: Vec<_> = join_all(futures).await.into_iter().try_collect()?;
|
||||
let proxies: Vec<_> = results
|
||||
.into_iter()
|
||||
.filter_map(|res| if let Ok(res) = res { Some(res) } else { None })
|
||||
.flatten()
|
||||
.map(|p| Url::from_str(&p.url()))
|
||||
.try_collect()?;
|
||||
Ok(proxies)
|
||||
}
|
||||
|
|
|
@ -1,25 +1,38 @@
|
|||
use std::sync::Arc;
|
||||
use std::env;
|
||||
|
||||
use reqwest::Url;
|
||||
use futures::{future, stream, StreamExt};
|
||||
use reqwest::{StatusCode, Url};
|
||||
use simple_error::bail;
|
||||
use tokio::fs;
|
||||
|
||||
use crate::{
|
||||
anyhow_retry_if_wasnt_not_found, build_client, get_fetch_retry_policy, get_parse_retry_policy,
|
||||
proxy_client::ProxyClient, sites, supermercado::Supermercado, PrecioPoint,
|
||||
anyhow_retry_if_wasnt_not_found, build_client, db::Db, get_fetch_retry_policy,
|
||||
get_parse_retry_policy, proxy_client::ProxyClient, sites, supermercado::Supermercado, Counters,
|
||||
PrecioPoint,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Scraper {
|
||||
default_client: reqwest::Client,
|
||||
proxy_client: Arc<ProxyClient>,
|
||||
proxy_client: ProxyClient,
|
||||
}
|
||||
|
||||
impl Scraper {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
pub async fn from_env() -> anyhow::Result<Self> {
|
||||
let proxy_list = match env::var("PROXY_LIST") {
|
||||
Ok(list) => list,
|
||||
Err(_) => match env::var("PROXY_LIST_PATH") {
|
||||
Ok(path) => fs::read_to_string(path).await?,
|
||||
Err(_) => "".to_owned(),
|
||||
},
|
||||
};
|
||||
Self::build(&proxy_list)
|
||||
}
|
||||
pub fn build(proxy_list: &str) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
default_client: build_client(),
|
||||
proxy_client: ProxyClient::default().into(),
|
||||
}
|
||||
proxy_client: ProxyClient::from_proxy_list(proxy_list)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_urls_for_supermercado(
|
||||
|
@ -36,7 +49,7 @@ impl Scraper {
|
|||
}
|
||||
|
||||
#[tracing::instrument(skip(self))]
|
||||
pub async fn fetch_and_parse(&self, url: String) -> Result<PrecioPoint, anyhow::Error> {
|
||||
pub async fn fetch_and_scrap(&self, url: String) -> Result<PrecioPoint, anyhow::Error> {
|
||||
async fn fetch_and_scrap(
|
||||
scraper: &Scraper,
|
||||
url: String,
|
||||
|
@ -92,6 +105,52 @@ impl Scraper {
|
|||
Ok(res.text().await?)
|
||||
}
|
||||
|
||||
pub async fn fetch_and_save(&self, url: String, db: Db) -> Counters {
|
||||
let res = self.fetch_and_scrap(url.clone()).await;
|
||||
let mut counters = Counters::default();
|
||||
match res {
|
||||
Ok(res) => {
|
||||
counters.success += 1;
|
||||
db.insert_precio(res).await.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
match err.downcast_ref::<reqwest::Error>() {
|
||||
Some(e) => match e.status() {
|
||||
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||
_ => counters.errored += 1,
|
||||
},
|
||||
_ => counters.errored += 1,
|
||||
}
|
||||
|
||||
tracing::error!(error=%err, url=url);
|
||||
}
|
||||
}
|
||||
counters
|
||||
}
|
||||
|
||||
pub async fn fetch_list(&self, db: &Db, links: Vec<String>) -> Counters {
|
||||
let n_coroutines = env::var("N_COROUTINES")
|
||||
.map_or(Ok(24), |s| s.parse::<usize>())
|
||||
.expect("N_COROUTINES no es un número");
|
||||
|
||||
stream::iter(links)
|
||||
.map(|url| {
|
||||
let db = db.clone();
|
||||
let scraper = self.clone();
|
||||
tokio::spawn(async move { scraper.fetch_and_save(url, db).await })
|
||||
})
|
||||
.buffer_unordered(n_coroutines)
|
||||
.fold(Counters::default(), move |x, y| {
|
||||
let ret = y.unwrap();
|
||||
future::ready(Counters {
|
||||
success: x.success + ret.success,
|
||||
errored: x.errored + ret.errored,
|
||||
skipped: x.skipped + ret.skipped,
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn scrap_url(&self, url: String, res_body: &str) -> anyhow::Result<PrecioPoint> {
|
||||
let url_p = Url::parse(&url).unwrap();
|
||||
match Supermercado::from_url(&url_p) {
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
use again::Task;
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
|
||||
use futures::{stream, StreamExt, TryStreamExt};
|
||||
use itertools::Itertools;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{
|
||||
anyhow_retry_if_wasnt_not_found, get_fetch_retry_policy, proxy_client::ProxyClient,
|
||||
retry_if_wasnt_not_found, PrecioPoint,
|
||||
anyhow_retry_if_wasnt_not_found, get_fetch_retry_policy, proxy_client::ProxyClient, PrecioPoint,
|
||||
};
|
||||
|
||||
pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error> {
|
||||
|
@ -83,7 +81,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
|||
pub async fn get_urls(proxy_client: &ProxyClient) -> anyhow::Result<Vec<String>> {
|
||||
let initial = Url::parse("https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29")?;
|
||||
|
||||
let page_size = 50;
|
||||
let page_size = 100;
|
||||
let handles: Vec<Vec<String>> = stream::iter(0..29000 / page_size)
|
||||
.map(|i| {
|
||||
let mut u = initial.clone();
|
||||
|
@ -136,6 +134,7 @@ pub async fn get_urls(proxy_client: &ProxyClient) -> anyhow::Result<Vec<String>>
|
|||
})
|
||||
})
|
||||
.try_collect()?;
|
||||
tracing::debug!("got {} products", list.len());
|
||||
Ok::<Vec<String>, anyhow::Error>(list)
|
||||
}
|
||||
})
|
||||
|
|
Loading…
Reference in a new issue