mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-25 19:16:19 +00:00
paralelizar
This commit is contained in:
parent
ba484709f8
commit
6b04dee2e4
3 changed files with 22 additions and 10 deletions
1
scraper-rs/Cargo.lock
generated
1
scraper-rs/Cargo.lock
generated
|
@@ -1165,6 +1165,7 @@ dependencies = [
|
|||
"anyhow",
|
||||
"async-channel",
|
||||
"clap",
|
||||
"futures",
|
||||
"itertools",
|
||||
"nanoid",
|
||||
"r2d2",
|
||||
|
|
|
@@ -10,6 +10,7 @@ again = "0.1.2"
|
|||
anyhow = "1.0.79"
|
||||
async-channel = "2.1.1"
|
||||
clap = { version = "4.4.15", features = ["derive"] }
|
||||
futures = "0.3.30"
|
||||
itertools = "0.12.0"
|
||||
nanoid = "0.4.0"
|
||||
r2d2 = "0.8.10"
|
||||
|
|
|
@@ -1,4 +1,5 @@
|
|||
use anyhow::{bail, Context};
|
||||
use futures::{stream, StreamExt, TryStreamExt};
|
||||
use itertools::Itertools;
|
||||
use serde::Deserialize;
|
||||
use simple_error::SimpleError;
|
||||
|
@@ -116,16 +117,25 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
|||
.collect())
|
||||
}
|
||||
|
||||
pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
|
||||
let mut total = vec![];
|
||||
let client = &build_client();
|
||||
for url in sitemaps {
|
||||
let text = get_retry_policy()
|
||||
.retry(|| do_request(client, url))
|
||||
.await?
|
||||
.text()
|
||||
.await?;
|
||||
let mut urls = parse_urls_from_sitemap(&text)?;
|
||||
pub async fn get_urls_from_sitemap<'a>(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
|
||||
let mut total: Vec<String> = vec![];
|
||||
let client = build_client();
|
||||
let handles = stream::iter(sitemaps)
|
||||
.map(|url| {
|
||||
let client = &client;
|
||||
async move {
|
||||
let text = get_retry_policy()
|
||||
.retry(|| do_request(client, &url))
|
||||
.await?
|
||||
.text()
|
||||
.await?;
|
||||
parse_urls_from_sitemap(&text)
|
||||
}
|
||||
})
|
||||
.buffer_unordered(8)
|
||||
.try_collect::<Vec<_>>()
|
||||
.await?;
|
||||
for mut urls in handles {
|
||||
total.append(&mut urls);
|
||||
}
|
||||
Ok(total.into_iter().unique().collect())
|
||||
|
|
Loading…
Reference in a new issue