parsear correctamente urls en sitemaps xml

This commit is contained in:
Cat /dev/Nulo 2024-01-14 10:45:47 -03:00
parent d2dbd3c093
commit dade60d677
3 changed files with 35 additions and 4 deletions

10
scraper-rs/Cargo.lock generated
View file

@ -902,6 +902,15 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.35" version = "1.0.35"
@ -1132,6 +1141,7 @@ dependencies = [
"futures", "futures",
"itertools", "itertools",
"nanoid", "nanoid",
"quick-xml",
"rand 0.8.5", "rand 0.8.5",
"reqwest", "reqwest",
"rusqlite", "rusqlite",

View file

@ -14,6 +14,7 @@ deadpool-sqlite = "0.7.0"
futures = "0.3.30" futures = "0.3.30"
itertools = "0.12.0" itertools = "0.12.0"
nanoid = "0.4.0" nanoid = "0.4.0"
quick-xml = "0.31.0"
rand = "0.8.5" rand = "0.8.5"
reqwest = { version = "0.11.23", default-features = false, features = [ reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls", "rustls-tls",

View file

@ -107,14 +107,34 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> { pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
let dom = tl::parse(sitemap, tl::ParserOptions::default())?; let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
Ok(dom dom.query_selector("loc")
.query_selector("loc")
.unwrap() .unwrap()
.filter_map(|h| h.get(dom.parser())) .filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag()) .filter_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser())) .map(|t| t.inner_text(dom.parser()))
.map(|s| s.to_string()) .map(|s| -> anyhow::Result<String> {
.collect()) Ok(quick_xml::escape::unescape(s.as_ref())?.to_string())
})
.try_collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A numeric character reference inside `<loc>` (here `&#x200B;`) must come back decoded
    /// as the character it denotes rather than as the literal escape sequence.
    #[test]
    fn test_decode_url() -> anyhow::Result<()> {
        // Deliberately truncated document: the closing `</urlset>` is omitted.
        let sitemap = r#"
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g&#x200B;-684952/p</loc>
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
</url>"#;
        let expected =
            "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p";

        let links = parse_urls_from_sitemap(sitemap)?;

        assert_eq!(links[0], expected);
        Ok(())
    }
}
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> { pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {