From c946e7fe35c68672a49d105e25d6cabff7404688 Mon Sep 17 00:00:00 2001 From: Nulo Date: Sun, 23 Jun 2024 13:27:29 -0300 Subject: [PATCH] refactor, simplify, get proxy list by env --- scraper-rs/Cargo.lock | 664 --------------------------------- scraper-rs/Cargo.toml | 2 - scraper-rs/src/auto.rs | 3 +- scraper-rs/src/main.rs | 68 +--- scraper-rs/src/proxy_client.rs | 300 +++------------ scraper-rs/src/scraper.rs | 79 +++- scraper-rs/src/sites/coto.rs | 9 +- 7 files changed, 127 insertions(+), 998 deletions(-) diff --git a/scraper-rs/Cargo.lock b/scraper-rs/Cargo.lock index 3c4a9ed..3e858c1 100644 --- a/scraper-rs/Cargo.lock +++ b/scraper-rs/Cargo.lock @@ -155,12 +155,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.3.0" @@ -251,12 +245,6 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" -[[package]] -name = "bytecount" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" - [[package]] name = "byteorder" version = "1.5.0" @@ -341,35 +329,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" -[[package]] -name = "console" -version = "0.15.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" -dependencies = [ - "encode_unicode", - "lazy_static", - "libc", - "unicode-width", - "windows-sys 0.52.0", -] - [[package]] name = "const-oid" version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -445,29 +410,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cssparser" -version = "0.31.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" -dependencies = [ - "cssparser-macros", - "dtoa-short", - "itoa", - "phf 0.11.2", - "smallvec", -] - -[[package]] -name = "cssparser-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" -dependencies = [ - "quote", - "syn 2.0.66", -] - [[package]] name = "der" version = "0.7.9" @@ -479,17 +421,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "derive_more" -version = "0.99.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - [[package]] name = "digest" version = "0.10.7" @@ -508,27 +439,6 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" -[[package]] -name = "dtoa" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" - -[[package]] -name = "dtoa-short" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" -dependencies = [ - "dtoa", -] - -[[package]] -name = "ego-tree" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" - [[package]] name = "either" version = "1.12.0" @@ -538,21 +448,6 @@ dependencies = [ "serde", ] -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" - -[[package]] -name = "encoding_rs" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" -dependencies = [ - "cfg-if", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -631,21 +526,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.2.1" @@ -655,16 +535,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "futures" version = "0.3.30" @@ -765,15 +635,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -784,15 +645,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getopts" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.1.16" @@ -821,25 +673,6 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" -[[package]] -name = "h2" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" -dependencies = [ - "atomic-waker", - "bytes", - "fnv", - "futures-core", - "futures-sink", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "hashbrown" version = "0.14.5" @@ -922,20 +755,6 @@ dependencies = [ "utf8-width", ] -[[package]] -name = "html5ever" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "http" version = "1.1.0" @@ -976,12 +795,6 @@ version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "hyper" version = "1.3.1" @@ -991,7 +804,6 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2", "http", "http-body", "httparse", @@ -1020,22 +832,6 @@ dependencies = [ "webpki-roots", ] -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - "native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - [[package]] name = "hyper-util" version = "0.1.5" @@ -1099,19 +895,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "indicatif" -version = "0.17.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" -dependencies = [ - "console", - "instant", - "number_prefix", - "portable-atomic", - "unicode-width", -] - [[package]] name = "instant" version = "0.1.13" @@ -1211,26 +994,6 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - -[[package]] -name = "markup5ever" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" -dependencies = [ - "log", - "phf 0.10.1", - "phf_codegen", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "md-5" version = "0.10.6" @@ -1288,29 +1051,6 @@ dependencies = [ "rand 0.8.5", ] -[[package]] -name = "native-tls" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" -dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "nom" version = "7.1.3" @@ -1388,12 +1128,6 @@ dependencies = [ "libc", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - [[package]] name = "object" version = "0.36.0" @@ -1409,67 +1143,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "openssl" -version = "0.10.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" -dependencies = [ - "bitflags 2.5.0", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-sys" -version = "0.9.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "overload" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" -[[package]] -name = "papergrid" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ad43c07024ef767f9160710b3a6773976194758c7919b17e63b863db0bdf7fb" -dependencies = [ - "bytecount", - "fnv", - "unicode-width", -] - [[package]] name = "parking_lot" version = "0.11.2" @@ -1539,86 +1218,6 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_shared 0.10.0", -] - -[[package]] -name = "phf" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" -dependencies = [ - "phf_macros", - "phf_shared 0.11.2", -] - -[[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - -[[package]] -name = "phf_generator" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" -dependencies = [ - "phf_shared 0.10.0", - "rand 0.8.5", -] - -[[package]] -name = "phf_generator" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" -dependencies = [ - "phf_shared 0.11.2", - "rand 0.8.5", -] - -[[package]] -name = "phf_macros" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" -dependencies = [ - "phf_generator 0.11.2", - "phf_shared 0.11.2", - "proc-macro2", - "quote", - "syn 2.0.66", -] - -[[package]] -name = "phf_shared" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" -dependencies = [ - "siphasher", -] - -[[package]] -name = "phf_shared" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" -dependencies = [ - "siphasher", -] - [[package]] name = "pin-project" version = "1.1.5" @@ -1678,48 +1277,12 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro2" version = "1.0.85" @@ -1729,24 +1292,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "proxy-scraper-checker" -version = "0.1.3" -source = "git+https://github.com/catdevnull/Proxy-Scraper-Checker?rev=429ca83d137abdf5377a1d22ee85d62bfb00437c#429ca83d137abdf5377a1d22ee85d62bfb00437c" -dependencies = [ - "anyhow", - "clap", - "futures", - "humantime", - "indicatif", - "reqwest", - "rlimit", - "scraper", - "serde_json", - "tabled", - "tokio", -] - [[package]] name = "quick-xml" version = "0.31.0" @@ -1919,22 +1464,18 @@ dependencies = [ "async-compression", "base64 0.22.1", "bytes", - "encoding_rs", "futures-core", "futures-util", - "h2", "http", "http-body", "http-body-util", "hyper", "hyper-rustls", - "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -1946,9 +1487,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", - "system-configuration", "tokio", - "tokio-native-tls", "tokio-rustls", "tokio-socks", "tokio-util", @@ -1976,15 +1515,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rlimit" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" -dependencies = [ - "libc", -] - [[package]] name = "rsa" version = "0.9.6" @@ -2091,37 +1621,12 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" -[[package]] -name = "schannel" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" -dependencies = [ - "windows-sys 0.52.0", -] - [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b80b33679ff7a0ea53d37f3b39de77ea0c75b12c5805ac43ec0c33b3051af1b" -dependencies = [ - "ahash", - "cssparser", - "ego-tree", - "getopts", - "html5ever", - "once_cell", - "selectors", - "tendril", -] - [[package]] name = "scraper-rs" version = "0.1.0" @@ -2134,10 +1639,8 @@ dependencies = [ "cron", "futures", "html-escape", - "indicatif", "itertools", "nanoid", - "proxy-scraper-checker", "quick-xml", "rand 0.8.5", "reqwest", @@ -2153,48 +1656,6 @@ dependencies = [ "tracing-subscriber", ] -[[package]] -name = "security-framework" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" -dependencies = [ - "bitflags 2.5.0", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "selectors" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" -dependencies = [ - "bitflags 2.5.0", - "cssparser", - "derive_more", - "fxhash", - "log", - "new_debug_unreachable", - "phf 0.10.1", - "phf_codegen", - "precomputed-hash", - "servo_arc", - "smallvec", -] - [[package]] name = "serde" version = "1.0.203" @@ -2238,15 +1699,6 @@ dependencies = [ "serde", ] -[[package]] -name = "servo_arc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "sha1" version = "0.10.6" @@ -2303,12 +1755,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e2accd2c41a0e920d2abd91b2badcfa1da784662f54fbc47e0e3a51f1e2e1cf" -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - [[package]] name = "slab" version = "0.4.9" @@ -2562,38 +2008,6 @@ dependencies = [ "urlencoding", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "string_cache" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" -dependencies = [ - "new_debug_unreachable", - "once_cell", - "parking_lot 0.12.3", - "phf_shared 0.10.0", - "precomputed-hash", - "serde", -] - -[[package]] -name = "string_cache_codegen" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro2", - "quote", -] - [[package]] name = "stringprep" version = "0.1.5" @@ -2645,51 +2059,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" -[[package]] -name = "system-configuration" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "tabled" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c998b0c8b921495196a48aabaf1901ff28be0760136e31604f7967b0792050e" -dependencies = [ - "papergrid", - "tabled_derive", - "unicode-width", -] - -[[package]] -name = "tabled_derive" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c138f99377e5d653a371cdad263615634cfc8467685dfe8e73e2b8e98f44b17" -dependencies = [ - "heck 0.4.1", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "tempfile" version = "3.10.1" @@ -2702,17 +2071,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tendril" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" -dependencies = [ - "futf", - "mac", - "utf-8", -] - [[package]] name = "thiserror" version = "1.0.61" @@ -2793,16 +2151,6 @@ dependencies = [ "syn 2.0.66", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.0" @@ -2980,12 +2328,6 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" -[[package]] -name = "unicode-width" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" - [[package]] name = "unicode_categories" version = "0.1.1" @@ -3015,12 +2357,6 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "utf8-width" version = "0.1.7" diff --git a/scraper-rs/Cargo.toml b/scraper-rs/Cargo.toml index 4046f88..8c7679d 100644 --- a/scraper-rs/Cargo.toml +++ b/scraper-rs/Cargo.toml @@ -35,5 +35,3 @@ tl = { git = "https://github.com/evertedsphere/tl", branch = "patch-1" } tokio = { version = "1.35.1", features = ["full"] } tracing = "0.1" tracing-subscriber = "0.3" -proxy-scraper-checker = { git = "https://github.com/catdevnull/Proxy-Scraper-Checker", rev = "429ca83d137abdf5377a1d22ee85d62bfb00437c" } -indicatif = "0.17.8" diff --git a/scraper-rs/src/auto.rs b/scraper-rs/src/auto.rs index ad9236c..9ba16f9 100644 --- a/scraper-rs/src/auto.rs +++ b/scraper-rs/src/auto.rs @@ -1,4 +1,3 @@ -use super::fetch_list; use super::now_sec; use super::supermercado::Supermercado; use super::AutoArgs; @@ -58,7 +57,7 @@ impl Auto { // } { let t0 = now_sec(); - let counters = fetch_list(&self.db, links).await; + let counters = self.scraper.fetch_list(&self.db, links).await; self.inform(&format!( "Downloaded {:?}: {:?} (took {})", &supermercado, diff --git a/scraper-rs/src/main.rs b/scraper-rs/src/main.rs index 3db3813..3571793 100644 --- a/scraper-rs/src/main.rs +++ b/scraper-rs/src/main.rs @@ -2,7 +2,7 @@ use again::RetryPolicy; use clap::{Parser, ValueEnum}; use cron::Schedule; use db::Db; -use futures::{future, stream, StreamExt, TryFutureExt}; +use futures::{future, TryFutureExt}; use reqwest::{header::HeaderMap, IntoUrl, StatusCode}; use scraper::Scraper; @@ -58,7 +58,7 @@ struct AutoArgs { } #[tokio::main] -async fn main() -> anyhow::Result<()> { +async fn main() -> () { tracing_subscriber::fmt::init(); match Args::parse() { @@ -70,11 +70,12 @@ async fn main() -> anyhow::Result<()> { Args::Auto(a) => auto_cli(a).await, Args::Cron(_) => cron_cli().await, } + .unwrap() } async fn scrap_url_cli(url: String) -> anyhow::Result<()> { - let scraper = Scraper::new(); - let res = scraper.fetch_and_parse(url.clone()).await; + let scraper = Scraper::from_env().await?; + let res = scraper.fetch_and_scrap(url.clone()).await; println!("Result: {:#?}", res); res.map(|_| ()) @@ -98,37 +99,13 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> { .collect::>(); let db = Db::connect().await?; - let counters = fetch_list(&db, links).await; + let scraper = Scraper::from_env().await?; + let counters = scraper.fetch_list(&db, links).await; println!("Finished: {:?}", counters); Ok(()) } -async fn fetch_list(db: &Db, links: Vec) -> Counters { - let n_coroutines = env::var("N_COROUTINES") - .map_or(Ok(24), |s| s.parse::()) - .expect("N_COROUTINES no es un número"); - - let scraper = Scraper::new(); - - stream::iter(links) - .map(|url| { - let db = db.clone(); - let scraper = scraper.clone(); - tokio::spawn(fetch_and_save(scraper, url, db)) - }) - .buffer_unordered(n_coroutines) - .fold(Counters::default(), move |x, y| { - let ret = y.unwrap(); - future::ready(Counters { - success: x.success + ret.success, - errored: x.errored + ret.errored, - skipped: x.skipped + ret.skipped, - }) - }) - .await -} - mod db; #[derive(Default, Debug)] @@ -138,29 +115,6 @@ struct Counters { skipped: u64, } -async fn fetch_and_save(scraper: Scraper, url: String, db: Db) -> Counters { - let res = scraper.fetch_and_parse(url.clone()).await; - let mut counters = Counters::default(); - match res { - Ok(res) => { - counters.success += 1; - db.insert_precio(res).await.unwrap(); - } - Err(err) => { - match err.downcast_ref::() { - Some(e) => match e.status() { - Some(StatusCode::NOT_FOUND) => counters.skipped += 1, - _ => counters.errored += 1, - }, - _ => counters.errored += 1, - } - - tracing::error!(error=%err, url=url); - } - } - counters -} - #[derive(Debug, Error)] enum FetchError { #[error("parse error")] @@ -222,7 +176,7 @@ pub fn anyhow_retry_if_wasnt_not_found(err: &anyhow::Error) -> bool { async fn parse_file_cli(file_path: String) -> anyhow::Result<()> { let file = tokio::fs::read_to_string(file_path).await?; - let scraper = Scraper::new(); + let scraper = Scraper::from_env().await?; let url = { let dom = tl::parse(&file, tl::ParserOptions::default())?; @@ -242,7 +196,7 @@ async fn parse_file_cli(file_path: String) -> anyhow::Result<()> { } async fn get_url_list_cli(supermercado: Supermercado) -> anyhow::Result<()> { - let scraper = Scraper::new(); + let scraper = Scraper::from_env().await?; let urls = scraper.get_urls_for_supermercado(&supermercado).await?; urls.iter().for_each(|s| { println!("{}", s); @@ -276,7 +230,7 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> { db, telegram, args, - scraper: Scraper::new(), + scraper: Scraper::from_env().await?, } }; auto.inform("[auto] Empezando scrap").await; @@ -289,7 +243,7 @@ async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> { let handles: Vec<_> = supermercados .iter() .map(|s| { - let x = s.clone(); + let x = *s; tokio::spawn( auto.clone() .download_supermercado(s.to_owned()) diff --git a/scraper-rs/src/proxy_client.rs b/scraper-rs/src/proxy_client.rs index ce4aff1..0936a70 100644 --- a/scraper-rs/src/proxy_client.rs +++ b/scraper-rs/src/proxy_client.rs @@ -1,274 +1,58 @@ -use std::{collections::HashSet, str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; -use anyhow::Context; -use futures::{future::join_all, stream, FutureExt, StreamExt}; -use indicatif::ProgressBar; use itertools::Itertools; -use proxy_scraper_checker::{Proxy, ProxyChecker}; use rand::Rng; use reqwest::{IntoUrl, Url}; -use serde::Deserialize; -use tokio::sync::{RwLock, Semaphore}; use crate::build_header_map; -#[derive(Default, Debug)] +#[derive(Debug, Clone)] pub struct ProxyClient { - proxies: RwLock>>, - clients: RwLock<[Option; 10]>, + // proxies: Vec, + clients: Vec, } impl ProxyClient { - pub async fn do_request(&self, url: impl IntoUrl + Clone) -> anyhow::Result { - loop { - let client = { - let mut client_ptr = self.clients.write().await; - if let Some(client) = (*client_ptr).clone() { - client - } else { - let proxies = self.get_proxies().await?; - // let proxy = stream::iter(proxies) - // .filter_map(|proxy| async { - // println!("trying proxy {}", proxy); - // check_proxy( - // proxy.clone(), - // "https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(), - // 3, - // ) - // .map(|r| r.ok()) - // .await - // }) - // .next() - // .await - // .unwrap() - // .clone(); - let proxy = loop { - let proxy = proxies[rand::thread_rng().gen_range(0..proxies.len())].clone(); - println!("trying proxy {}", proxy); - match check_proxy( - proxy, - "https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(), - 10, - ) - .await - { - Ok(proxy) => break proxy, - Err(_) => continue, - } - }; - - println!("chose proxy {}", proxy); - let new_client = reqwest::ClientBuilder::default() - .timeout(Duration::from_secs(300)) - .connect_timeout(Duration::from_secs(150)) - .default_headers(build_header_map()) - .proxy(reqwest::Proxy::all(proxy)?) - .build() - .unwrap(); - let ret = new_client.clone(); - *client_ptr = Some(new_client); - ret - } - }; - let req = client.get(url.clone()).build()?; - match client.execute(req).await { - Ok(res) => return Ok(res), - Err(_) => { - // possibly IP locked, reset client to get another IP - { - println!("request failed, resetting client"); - *(self.clients.write().await) = None; - } - } - } - } - } - - pub async fn get_proxies(&self) -> anyhow::Result> { - let mut proxies_ptr = self.proxies.write().await; - if let Some(proxies) = (*proxies_ptr).clone() { - Ok(proxies) + pub fn from_proxy_list(proxies: &str) -> anyhow::Result { + let proxies = Self::parse_proxy_list(proxies)?; + let clients = if proxies.is_empty() { + tracing::warn!("No proxies available; using no proxy"); + vec![Self::client_builder().build()?] } else { - // let scraper = proxy_scraper_checker::ProxyScraper::default(); + proxies + .clone() + .into_iter() + .map(Self::build_client_with_proxy) + .try_collect()? + }; + Ok(Self { clients }) + } - // let archive_urls = scraper.scrape_archive_urls().await?; - // let futures: Vec<_> = archive_urls - // .into_iter() - // .map(|url| { - // tokio::task::spawn({ - // let value = scraper.clone(); - // async move { value.scrape_proxies(url, true).await } - // }) - // }) - // .collect(); - // let results: Vec<_> = join_all(futures).await.into_iter().try_collect()?; - // let proxies: Vec<_> = results - // .into_iter() - // .filter_map(|res| if let Ok(res) = res { Some(res) } else { None }) - // .flatten() - // .filter(|x| { - // if let Proxy::Socks5(_) = x { - // true - // } else { - // false - // } - // }) - // .collect(); + fn parse_proxy_list(proxies: &str) -> anyhow::Result> { + Ok(proxies + .split("\n") + .filter(|s| !s.trim().is_empty()) + .map(Url::parse) + .try_collect()?) + } + fn client_builder() -> reqwest::ClientBuilder { + reqwest::ClientBuilder::default() + .timeout(Duration::from_secs(300)) + .connect_timeout(Duration::from_secs(150)) + .default_headers(build_header_map()) + } + fn build_client_with_proxy(proxy: Url) -> reqwest::Result { + Self::client_builder() + .proxy(reqwest::Proxy::all(proxy)?) + .build() + } - let socks5_proxies = get_proxy_list_from_raw_list( - "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt", - "socks5", - ) - .await?; - let http_proxies = get_proxy_list_from_raw_list( - "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt", - "http", - ) - .await?; - let fosy_http_proxies = - get_proxy_list_from_raw_list("https://fosy.club/api/free/list?type=http", "http") - .await?; - let fosy_socks5_proxies = get_proxy_list_from_raw_list( - "https://fosy.club/api/free/list?type=socks5", - "socks5", - ) - .await?; - let geonode_proxies = get_proxy_list_geonode() - .await - .inspect_err(|e| tracing::error!("getting proxy list ({error})", error = e))?; - - // let proxies: Vec<_> = [ - // // socks5_proxies, - // // http_proxies, - // fosy_http_proxies, - // fosy_socks5_proxies, - // geonode_proxies, - // ] - // .into_iter() - // .flatten() - // .collect(); - - let checked_proxies: Vec<_> = { - let proxiess: HashSet<_> = proxies - .into_iter() - .filter_map(|p| match p.scheme() { - "socks5" => Some(Proxy::Socks5(p.host_str()?.to_string())), - "http" => Some(Proxy::Http(p.host_str()?.to_string())), - _ => None, - }) - .collect(); - let checker = ProxyChecker::new( - Arc::new(Semaphore::new(32)), - ProgressBar::new(proxiess.len().try_into().unwrap()), - ); - checker - .check_proxies(proxiess.into(), "https://milei.nulo.in/".to_string(), 8) - .await? - .into_iter() - .map(|p| Url::from_str(&p.url())) - .try_collect()? - }; - - let ret = checked_proxies.clone(); - println!("got {} proxies", ret.len()); - *proxies_ptr = Some(checked_proxies); - Ok(ret) - } + pub async fn do_request( + &self, + url: impl IntoUrl + Clone, + ) -> reqwest::Result { + let client = self.clients[rand::thread_rng().gen_range(0..self.clients.len())].clone(); + let req = client.get(url.clone()).build()?; + client.execute(req).await } } - -pub async fn check_proxy(proxy: Url, url: String, timeout: u64) -> anyhow::Result { - let client = reqwest::Client::builder() - .proxy(reqwest::Proxy::all(proxy.clone())?) - .timeout(Duration::from_secs(timeout)) - .build()?; - - client - .get(url) - .send() - .await - .context("Request failed")? - .error_for_status() - .context("Request returned an error status code")?; - - Ok(proxy) -} - -// pub async fn find_first_working_proxy(proxies: Vec) -> anyhow::Result { -// let semaphore = Arc::new(Semaphore::new(64)); -// for proxy in proxies { -// let semaphore = semaphore.clone(); - -// } - -// let proxy = stream::iter(proxies) -// .filter_map(|proxy| async { -// println!("trying proxy {}", proxy); -// check_proxy( -// proxy.clone(), -// "https://www.cotodigital3.com.ar/sitios/cdigi/".to_string(), -// 3, -// ) -// .map(|r| r.ok()) -// .await -// }).concu -// .next() -// .await -// .unwrap() -// .clone(); - -// } - -pub async fn get_proxy_list_from_raw_list( - list_url: U, - protocol: &str, -) -> anyhow::Result> { - let res = reqwest::get(list_url).await?; - let text = res.text().await?; - Ok(text - .lines() - .map(|l| Url::from_str(&format!("{}://{}", protocol, l))) - .filter_map(|r| r.ok()) - .collect()) -} - -#[derive(Deserialize)] -struct Ips { - data: Vec, -} -#[derive(Deserialize)] -struct Ip { - ip: String, - port: String, - protocols: Vec, -} -pub async fn get_proxy_list_geonode() -> anyhow::Result> { - let ips = reqwest::get("https://proxylist.geonode.com/api/proxy-list?protocols=socks5%2Chttp&filterUpTime=90&limit=500&page=1&sort_by=lastChecked&sort_type=asc").await?.json::().await?; - Ok(ips - .data - .into_iter() - .map(|i| Url::from_str(&format!("{}://{}:{}", i.protocols[0], i.ip, i.port))) - .filter_map(|r| r.ok()) - .collect()) -} -pub async fn get_proxy_list_checkerproxy() -> anyhow::Result> { - let scraper = proxy_scraper_checker::ProxyScraper::default(); - let archive_urls = scraper.scrape_archive_urls().await?; - let futures: Vec<_> = archive_urls - .into_iter() - .map(|url| { - tokio::task::spawn({ - let value = scraper.clone(); - async move { value.scrape_proxies(url, true).await } - }) - }) - .collect(); - let results: Vec<_> = join_all(futures).await.into_iter().try_collect()?; - let proxies: Vec<_> = results - .into_iter() - .filter_map(|res| if let Ok(res) = res { Some(res) } else { None }) - .flatten() - .map(|p| Url::from_str(&p.url())) - .try_collect()?; - Ok(proxies) -} diff --git a/scraper-rs/src/scraper.rs b/scraper-rs/src/scraper.rs index 2a1eb2e..2c9a49e 100644 --- a/scraper-rs/src/scraper.rs +++ b/scraper-rs/src/scraper.rs @@ -1,25 +1,38 @@ -use std::sync::Arc; +use std::env; -use reqwest::Url; +use futures::{future, stream, StreamExt}; +use reqwest::{StatusCode, Url}; use simple_error::bail; +use tokio::fs; use crate::{ - anyhow_retry_if_wasnt_not_found, build_client, get_fetch_retry_policy, get_parse_retry_policy, - proxy_client::ProxyClient, sites, supermercado::Supermercado, PrecioPoint, + anyhow_retry_if_wasnt_not_found, build_client, db::Db, get_fetch_retry_policy, + get_parse_retry_policy, proxy_client::ProxyClient, sites, supermercado::Supermercado, Counters, + PrecioPoint, }; #[derive(Debug, Clone)] pub struct Scraper { default_client: reqwest::Client, - proxy_client: Arc, + proxy_client: ProxyClient, } impl Scraper { - pub fn new() -> Self { - Self { + pub async fn from_env() -> anyhow::Result { + let proxy_list = match env::var("PROXY_LIST") { + Ok(list) => list, + Err(_) => match env::var("PROXY_LIST_PATH") { + Ok(path) => fs::read_to_string(path).await?, + Err(_) => "".to_owned(), + }, + }; + Self::build(&proxy_list) + } + pub fn build(proxy_list: &str) -> anyhow::Result { + Ok(Self { default_client: build_client(), - proxy_client: ProxyClient::default().into(), - } + proxy_client: ProxyClient::from_proxy_list(proxy_list)?, + }) } pub async fn get_urls_for_supermercado( @@ -36,7 +49,7 @@ impl Scraper { } #[tracing::instrument(skip(self))] - pub async fn fetch_and_parse(&self, url: String) -> Result { + pub async fn fetch_and_scrap(&self, url: String) -> Result { async fn fetch_and_scrap( scraper: &Scraper, url: String, @@ -92,6 +105,52 @@ impl Scraper { Ok(res.text().await?) } + pub async fn fetch_and_save(&self, url: String, db: Db) -> Counters { + let res = self.fetch_and_scrap(url.clone()).await; + let mut counters = Counters::default(); + match res { + Ok(res) => { + counters.success += 1; + db.insert_precio(res).await.unwrap(); + } + Err(err) => { + match err.downcast_ref::() { + Some(e) => match e.status() { + Some(StatusCode::NOT_FOUND) => counters.skipped += 1, + _ => counters.errored += 1, + }, + _ => counters.errored += 1, + } + + tracing::error!(error=%err, url=url); + } + } + counters + } + + pub async fn fetch_list(&self, db: &Db, links: Vec) -> Counters { + let n_coroutines = env::var("N_COROUTINES") + .map_or(Ok(24), |s| s.parse::()) + .expect("N_COROUTINES no es un número"); + + stream::iter(links) + .map(|url| { + let db = db.clone(); + let scraper = self.clone(); + tokio::spawn(async move { scraper.fetch_and_save(url, db).await }) + }) + .buffer_unordered(n_coroutines) + .fold(Counters::default(), move |x, y| { + let ret = y.unwrap(); + future::ready(Counters { + success: x.success + ret.success, + errored: x.errored + ret.errored, + skipped: x.skipped + ret.skipped, + }) + }) + .await + } + pub async fn scrap_url(&self, url: String, res_body: &str) -> anyhow::Result { let url_p = Url::parse(&url).unwrap(); match Supermercado::from_url(&url_p) { diff --git a/scraper-rs/src/sites/coto.rs b/scraper-rs/src/sites/coto.rs index 19e970f..5de3ed6 100644 --- a/scraper-rs/src/sites/coto.rs +++ b/scraper-rs/src/sites/coto.rs @@ -1,12 +1,10 @@ -use again::Task; use anyhow::{anyhow, Context}; -use futures::{stream, StreamExt, TryFutureExt, TryStreamExt}; +use futures::{stream, StreamExt, TryStreamExt}; use itertools::Itertools; use reqwest::Url; use crate::{ - anyhow_retry_if_wasnt_not_found, get_fetch_retry_policy, proxy_client::ProxyClient, - retry_if_wasnt_not_found, PrecioPoint, + anyhow_retry_if_wasnt_not_found, get_fetch_retry_policy, proxy_client::ProxyClient, PrecioPoint, }; pub fn parse(url: String, dom: &tl::VDom) -> Result { @@ -83,7 +81,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result pub async fn get_urls(proxy_client: &ProxyClient) -> anyhow::Result> { let initial = Url::parse("https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29")?; - let page_size = 50; + let page_size = 100; let handles: Vec> = stream::iter(0..29000 / page_size) .map(|i| { let mut u = initial.clone(); @@ -136,6 +134,7 @@ pub async fn get_urls(proxy_client: &ProxyClient) -> anyhow::Result> }) }) .try_collect()?; + tracing::debug!("got {} products", list.len()); Ok::, anyhow::Error>(list) } })