mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
parsear correctamente urls en sitemaps xml
This commit is contained in:
parent
d2dbd3c093
commit
dade60d677
3 changed files with 35 additions and 4 deletions
10
scraper-rs/Cargo.lock
generated
10
scraper-rs/Cargo.lock
generated
|
@ -902,6 +902,15 @@ dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quick-xml"
|
||||||
|
version = "0.31.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quote"
|
name = "quote"
|
||||||
version = "1.0.35"
|
version = "1.0.35"
|
||||||
|
@ -1132,6 +1141,7 @@ dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"itertools",
|
"itertools",
|
||||||
"nanoid",
|
"nanoid",
|
||||||
|
"quick-xml",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"rusqlite",
|
"rusqlite",
|
||||||
|
|
|
@ -14,6 +14,7 @@ deadpool-sqlite = "0.7.0"
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
itertools = "0.12.0"
|
itertools = "0.12.0"
|
||||||
nanoid = "0.4.0"
|
nanoid = "0.4.0"
|
||||||
|
quick-xml = "0.31.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
reqwest = { version = "0.11.23", default-features = false, features = [
|
reqwest = { version = "0.11.23", default-features = false, features = [
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
|
|
|
@ -107,14 +107,34 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
|
||||||
|
|
||||||
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
||||||
let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
|
let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
|
||||||
Ok(dom
|
dom.query_selector("loc")
|
||||||
.query_selector("loc")
|
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.filter_map(|h| h.get(dom.parser()))
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
.filter_map(|n| n.as_tag())
|
.filter_map(|n| n.as_tag())
|
||||||
.map(|t| t.inner_text(dom.parser()))
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
.map(|s| s.to_string())
|
.map(|s| -> anyhow::Result<String> {
|
||||||
.collect())
|
Ok(quick_xml::escape::unescape(s.as_ref())?.to_string())
|
||||||
|
})
|
||||||
|
.try_collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_decode_url() -> anyhow::Result<()> {
|
||||||
|
let links = parse_urls_from_sitemap(
|
||||||
|
r#"
|
||||||
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||||
|
<url>
|
||||||
|
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g&#8203;-684952/p</loc>
|
||||||
|
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
||||||
|
</url>"#,
|
||||||
|
)?;
|
||||||
|
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
||||||
|
|
Loading…
Reference in a new issue