paralelizar

This commit is contained in:
Cat /dev/Nulo 2024-01-12 10:06:44 -03:00
parent ba484709f8
commit 6b04dee2e4
3 changed files with 22 additions and 10 deletions

1
scraper-rs/Cargo.lock generated
View file

@@ -1165,6 +1165,7 @@ dependencies = [
"anyhow",
"async-channel",
"clap",
"futures",
"itertools",
"nanoid",
"r2d2",

View file

@@ -10,6 +10,7 @@ again = "0.1.2"
anyhow = "1.0.79"
async-channel = "2.1.1"
clap = { version = "4.4.15", features = ["derive"] }
futures = "0.3.30"
itertools = "0.12.0"
nanoid = "0.4.0"
r2d2 = "0.8.10"

View file

@@ -1,4 +1,5 @@
use anyhow::{bail, Context};
use futures::{stream, StreamExt, TryStreamExt};
use itertools::Itertools;
use serde::Deserialize;
use simple_error::SimpleError;
@@ -116,16 +117,25 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
.collect())
}
pub async fn get_urls_from_sitemap(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
let mut total = vec![];
let client = &build_client();
for url in sitemaps {
pub async fn get_urls_from_sitemap<'a>(sitemaps: &[&str]) -> anyhow::Result<Vec<String>> {
let mut total: Vec<String> = vec![];
let client = build_client();
let handles = stream::iter(sitemaps)
.map(|url| {
let client = &client;
async move {
let text = get_retry_policy()
.retry(|| do_request(client, url))
.retry(|| do_request(client, &url))
.await?
.text()
.await?;
let mut urls = parse_urls_from_sitemap(&text)?;
parse_urls_from_sitemap(&text)
}
})
.buffer_unordered(8)
.try_collect::<Vec<_>>()
.await?;
for mut urls in handles {
total.append(&mut urls);
}
Ok(total.into_iter().unique().collect())