mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
paralelizar
This commit is contained in:
parent
ba484709f8
commit
6b04dee2e4
3 changed files with 22 additions and 10 deletions
1
scraper-rs/Cargo.lock
generated
1
scraper-rs/Cargo.lock
generated
|
@ -1165,6 +1165,7 @@ dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-channel",
|
"async-channel",
|
||||||
"clap",
|
"clap",
|
||||||
|
"futures",
|
||||||
"itertools",
|
"itertools",
|
||||||
"nanoid",
|
"nanoid",
|
||||||
"r2d2",
|
"r2d2",
|
||||||
|
|
|
@ -10,6 +10,7 @@ again = "0.1.2"
|
||||||
anyhow = "1.0.79"
|
anyhow = "1.0.79"
|
||||||
async-channel = "2.1.1"
|
async-channel = "2.1.1"
|
||||||
clap = { version = "4.4.15", features = ["derive"] }
|
clap = { version = "4.4.15", features = ["derive"] }
|
||||||
|
futures = "0.3.30"
|
||||||
itertools = "0.12.0"
|
itertools = "0.12.0"
|
||||||
nanoid = "0.4.0"
|
nanoid = "0.4.0"
|
||||||
r2d2 = "0.8.10"
|
r2d2 = "0.8.10"
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
use futures::{stream, StreamExt, TryStreamExt};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use simple_error::SimpleError;
|
use simple_error::SimpleError;
|
||||||
|
@ -116,16 +117,25 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
|
pub async fn get_urls_from_sitemap<'a>(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
|
||||||
let mut total = vec![];
|
let mut total: Vec<String> = vec![];
|
||||||
let client = &build_client();
|
let client = build_client();
|
||||||
for url in sitemaps {
|
let handles = stream::iter(sitemaps)
|
||||||
|
.map(|url| {
|
||||||
|
let client = &client;
|
||||||
|
async move {
|
||||||
let text = get_retry_policy()
|
let text = get_retry_policy()
|
||||||
.retry(|| do_request(client, url))
|
.retry(|| do_request(client, &url))
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
.await?;
|
.await?;
|
||||||
let mut urls = parse_urls_from_sitemap(&text)?;
|
parse_urls_from_sitemap(&text)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.buffer_unordered(8)
|
||||||
|
.try_collect::<Vec<_>>()
|
||||||
|
.await?;
|
||||||
|
for mut urls in handles {
|
||||||
total.append(&mut urls);
|
total.append(&mut urls);
|
||||||
}
|
}
|
||||||
Ok(total.into_iter().unique().collect())
|
Ok(total.into_iter().unique().collect())
|
||||||
|
|
Loading…
Reference in a new issue