paralelizar

This commit is contained in:
Cat /dev/Nulo 2024-01-12 10:06:44 -03:00
parent ba484709f8
commit 6b04dee2e4
3 changed files with 22 additions and 10 deletions

1
scraper-rs/Cargo.lock generated
View file

@ -1165,6 +1165,7 @@ dependencies = [
"anyhow", "anyhow",
"async-channel", "async-channel",
"clap", "clap",
"futures",
"itertools", "itertools",
"nanoid", "nanoid",
"r2d2", "r2d2",

View file

@ -10,6 +10,7 @@ again = "0.1.2"
anyhow = "1.0.79" anyhow = "1.0.79"
async-channel = "2.1.1" async-channel = "2.1.1"
clap = { version = "4.4.15", features = ["derive"] } clap = { version = "4.4.15", features = ["derive"] }
futures = "0.3.30"
itertools = "0.12.0" itertools = "0.12.0"
nanoid = "0.4.0" nanoid = "0.4.0"
r2d2 = "0.8.10" r2d2 = "0.8.10"

View file

@ -1,4 +1,5 @@
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use futures::{stream, StreamExt, TryStreamExt};
use itertools::Itertools; use itertools::Itertools;
use serde::Deserialize; use serde::Deserialize;
use simple_error::SimpleError; use simple_error::SimpleError;
@ -116,16 +117,25 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
.collect()) .collect())
} }
pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> { pub async fn get_urls_from_sitemap<'a>(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
let mut total = vec![]; let mut total: Vec<String> = vec![];
let client = &build_client(); let client = build_client();
for url in sitemaps { let handles = stream::iter(sitemaps)
let text = get_retry_policy() .map(|url| {
.retry(|| do_request(client, url)) let client = &client;
.await? async move {
.text() let text = get_retry_policy()
.await?; .retry(|| do_request(client, &url))
let mut urls = parse_urls_from_sitemap(&text)?; .await?
.text()
.await?;
parse_urls_from_sitemap(&text)
}
})
.buffer_unordered(8)
.try_collect::<Vec<_>>()
.await?;
for mut urls in handles {
total.append(&mut urls); total.append(&mut urls);
} }
Ok(total.into_iter().unique().collect()) Ok(total.into_iter().unique().collect())