coto: decodear html entities

This commit is contained in:
Cat /dev/Nulo 2024-01-25 16:49:31 -03:00
parent 856dfcb1a4
commit f7bc0a9db8
3 changed files with 19 additions and 1 deletions

16
scraper-rs/Cargo.lock generated
View file

@@ -604,6 +604,15 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]] [[package]]
name = "http" name = "http"
version = "0.2.11" version = "0.2.11"
@@ -1229,6 +1238,7 @@ dependencies = [
"deadpool", "deadpool",
"deadpool-sqlite", "deadpool-sqlite",
"futures", "futures",
"html-escape",
"itertools", "itertools",
"nanoid", "nanoid",
"quick-xml", "quick-xml",
@@ -1614,6 +1624,12 @@ dependencies = [
"percent-encoding", "percent-encoding",
] ]
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.1" version = "0.2.1"

View file

@@ -14,6 +14,7 @@ cron = "0.12.0"
deadpool = "0.10.0" deadpool = "0.10.0"
deadpool-sqlite = "0.7.0" deadpool-sqlite = "0.7.0"
futures = "0.3.30" futures = "0.3.30"
html-escape = "0.2.13"
itertools = "0.12.0" itertools = "0.12.0"
nanoid = "0.4.0" nanoid = "0.4.0"
quick-xml = "0.31.0" quick-xml = "0.31.0"

View file

@@ -53,7 +53,8 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
.filter_map(|h| h.get(dom.parser())) .filter_map(|h| h.get(dom.parser()))
.find_map(|n| n.as_tag()) .find_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser())) .map(|t| t.inner_text(dom.parser()))
.map(|s| s.trim().to_string()); // https://github.com/catdevnull/preciazo/issues/24
.map(|s| html_escape::decode_html_entities(s.trim()).to_string());
let image_url = dom let image_url = dom
.query_selector(".zoomImage1") .query_selector(".zoomImage1")