Compare commits

...

6 commits

SHA1       Message                                                          Date
ec9ba5c53d search: search keywords separately and escape them (fixes #21)   2024-01-25 17:16:12 -03:00
c687ea1484 follow commander clippy                                          2024-01-25 17:12:32 -03:00
cce34571f1 don't require telegram for auto                                  2024-01-25 16:56:27 -03:00
f7bc0a9db8 coto: decode html entities                                       2024-01-25 16:49:31 -03:00
856dfcb1a4 cli: scrape individual url                                       2024-01-25 16:47:51 -03:00
94510825c1 ci: update                                                       2024-01-24 21:20:34 -03:00
7 changed files with 100 additions and 52 deletions

GitHub Actions workflow (site TypeScript check)

@@ -13,15 +13,15 @@ jobs:
     name: chequear typescript del sitio
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: pnpm/action-setup@v2
         with:
           version: 8
       - name: Use Node.js 20
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           node-version: 20
-          cache: 'pnpm'
+          cache: "pnpm"
       - name: Install dependencies
         run: pnpm install

scraper-rs/Cargo.lock (generated, 16 lines changed)

@@ -604,6 +604,15 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
+
 [[package]]
 name = "http"
 version = "0.2.11"
@@ -1229,6 +1238,7 @@ dependencies = [
  "deadpool",
  "deadpool-sqlite",
  "futures",
+ "html-escape",
  "itertools",
  "nanoid",
  "quick-xml",
@@ -1614,6 +1624,12 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "utf8-width"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
+
 [[package]]
 name = "utf8parse"
 version = "0.2.1"

scraper-rs/Cargo.toml

@@ -14,6 +14,7 @@ cron = "0.12.0"
 deadpool = "0.10.0"
 deadpool-sqlite = "0.7.0"
 futures = "0.3.30"
+html-escape = "0.2.13"
 itertools = "0.12.0"
 nanoid = "0.4.0"
 quick-xml = "0.31.0"

scraper-rs/src/main.rs

@@ -23,7 +23,7 @@ enum Supermercado {
     Coto,
 }
 impl Supermercado {
-    fn host(self: &Self) -> &'static str {
+    fn host(&self) -> &'static str {
         match self {
             Self::Dia => "diaonline.supermercadosdia.com.ar",
             Self::Carrefour => "www.carrefour.com.ar",
@@ -38,6 +38,7 @@ enum Args {
     FetchList(FetchListArgs),
     ParseFile(ParseFileArgs),
     GetUrlList(GetUrlListArgs),
+    ScrapUrl(ScrapUrlArgs),
     Auto(AutoArgs),
     Cron(AutoArgs),
 }
@@ -55,6 +56,10 @@ struct GetUrlListArgs {
     supermercado: Supermercado,
 }
+#[derive(clap::Args)]
+struct ScrapUrlArgs {
+    url: String,
+}
 #[derive(clap::Args)]
 struct AutoArgs {}
 
 #[tokio::main]
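Review note: clap's derive renames enum variants to kebab-case by default, so the new variant should surface as a scrap-url subcommand. A minimal sketch of the parsing behavior (the binary name "scraper" is hypothetical; assumes clap with the derive feature):

    use clap::Parser;

    #[derive(Parser)]
    enum Cli {
        ScrapUrl { url: String },
    }

    fn main() {
        // e.g. `scraper scrap-url https://example.com/some-product/p`
        match Cli::parse_from(["scraper", "scrap-url", "https://example.com/some-product/p"]) {
            Cli::ScrapUrl { url } => println!("would scrape {url}"),
        }
    }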
@@ -65,11 +70,20 @@ async fn main() -> anyhow::Result<()> {
         Args::FetchList(a) => fetch_list_cli(a.list_path).await,
         Args::ParseFile(a) => parse_file_cli(a.file_path).await,
         Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
+        Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
         Args::Auto(_) => auto_cli().await,
         Args::Cron(_) => cron_cli().await,
     }
 }
 
+async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
+    let client = build_client();
+    let res = fetch_and_parse(&client, url.clone()).await;
+
+    println!("Result: {:#?}", res);
+    res.map(|_| ())
+}
+
 async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     let links_str = fs::read_to_string(links_list_path).unwrap();
     let links = links_str
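Note on scrap_url_cli above: keeping the Result and printing it with {:#?} before returning means a failed parse is both displayed and still reported through main's exit status; res.map(|_| ()) just drops the parsed PrecioPoint on success. The same pattern in isolation (a sketch; demo and the u32 payload are stand-ins):

    fn demo(res: anyhow::Result<u32>) -> anyhow::Result<()> {
        println!("Result: {:#?}", res); // prints the Ok(..) and Err(..) cases alike
        res.map(|_| ()) // discard the value, keep any error for the caller
    }

Using `?` on fetch_and_parse instead would propagate the error before anything was printed.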
@@ -114,8 +128,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
 fn connect_db() -> Pool {
     let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
     let cfg = deadpool_sqlite::Config::new(db_path);
-    let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
-    pool
+    cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
 }
 
 fn build_client() -> reqwest::Client {
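For context on connect_db: deadpool-sqlite hands out pooled rusqlite connections, and queries go through interact(), which runs the closure on a blocking thread. A minimal usage sketch under that API (the precios table is the one queried by the site change below; error mapping is simplified):

    use deadpool_sqlite::{Config, Runtime};

    async fn count_precios() -> anyhow::Result<i64> {
        let pool = Config::new("../sqlite.db").create_pool(Runtime::Tokio1)?;
        let conn = pool.get().await?;
        let n = conn
            .interact(|conn| {
                // plain rusqlite::Connection, off the async executor
                conn.query_row("select count(*) from precios", [], |row| row.get(0))
            })
            .await
            .map_err(|e| anyhow::anyhow!("interact failed: {e}"))??;
        Ok(n)
    }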
@@ -269,27 +282,32 @@ async fn scrap_url(
     let url_p = Url::parse(&url).unwrap();
     match url_p.host_str().unwrap() {
         "www.carrefour.com.ar" => {
-            sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+            sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
         }
         "diaonline.supermercadosdia.com.ar" => {
-            sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+            sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
         }
         "www.cotodigital3.com.ar" => {
-            sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
+            sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
         }
         "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
         s => bail!("Unknown host {}", s),
     }
 }
 
+#[derive(Clone)]
+struct AutoTelegram {
+    token: String,
+    chat_id: String,
+}
+
 #[derive(Clone)]
 struct Auto {
     pool: Pool,
-    telegram_token: String,
-    telegram_chat_id: String,
+    telegram: Option<AutoTelegram>,
 }
 impl Auto {
-    async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> {
+    async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
         {
             let t0 = now_sec();
             self.get_and_save_urls(&supermercado).await?;
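The scrap_url hunk above drops a needless borrow (&body where body already coerces to &str, likely flagged by the clippy commit) and dispatches on the URL host to pick a site parser. The dispatch mechanism in isolation (hosts copied from the match arms; parser_name is a hypothetical helper):

    use url::Url;

    fn parser_name(url: &str) -> anyhow::Result<&'static str> {
        let parsed = Url::parse(url)?;
        Ok(match parsed.host_str() {
            Some("www.carrefour.com.ar") => "carrefour",
            Some("diaonline.supermercadosdia.com.ar") => "dia",
            Some("www.cotodigital3.com.ar") => "coto",
            Some("www.jumbo.com.ar") => "jumbo",
            other => anyhow::bail!("Unknown host {:?}", other),
        })
    }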
@@ -341,7 +359,7 @@ impl Auto {
         Ok(())
     }
 
-    async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
+    async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
         let urls = get_urls(supermercado).await?;
         self.pool
             .get()
@@ -367,30 +385,37 @@ impl Auto {
         Ok(())
     }
 
-    async fn inform(self: &Self, msg: &str) {
+    async fn inform(&self, msg: &str) {
         println!("{}", msg);
-        let u = Url::parse_with_params(
-            &format!(
-                "https://api.telegram.org/bot{}/sendMessage",
-                self.telegram_token
-            ),
-            &[
-                ("chat_id", self.telegram_chat_id.clone()),
-                ("text", msg.to_string()),
-            ],
-        )
-        .unwrap();
-        reqwest::get(u).await.unwrap();
+        if let Some(telegram) = &self.telegram {
+            let u = Url::parse_with_params(
+                &format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
+                &[
+                    ("chat_id", telegram.chat_id.clone()),
+                    ("text", msg.to_string()),
+                ],
+            )
+            .unwrap();
+            reqwest::get(u).await.unwrap();
+        }
     }
 }
 
 async fn auto_cli() -> anyhow::Result<()> {
     let db = connect_db();
-    let auto = Auto {
-        pool: db,
-        telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
-        telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
-    };
+    let telegram = {
+        match (
+            env::var("TELEGRAM_BOT_TOKEN"),
+            env::var("TELEGRAM_BOT_CHAT_ID"),
+        ) {
+            (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
+            _ => {
+                tracing::warn!("No token or chat_id for telegram");
+                None
+            }
+        }
+    };
+    let auto = Auto { pool: db, telegram };
     auto.inform("[auto] Empezando scrap").await;
     let handles: Vec<_> = Supermercado::value_variants()
         .iter()
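Net effect of the two hunks above: the Telegram credentials move into an optional AutoTelegram, auto_cli warns instead of failing when TELEGRAM_BOT_TOKEN / TELEGRAM_BOT_CHAT_ID are unset, and inform() degrades to a plain println!. For reference, the request inform() builds is a GET to the Bot API's sendMessage with chat_id and text as query parameters; a sketch of just the URL construction (placeholder token and chat id):

    use url::Url;

    fn sendmessage_url(token: &str, chat_id: &str, text: &str) -> Url {
        Url::parse_with_params(
            &format!("https://api.telegram.org/bot{token}/sendMessage"),
            &[("chat_id", chat_id), ("text", text)],
        )
        .expect("static URL is valid")
    }

    // sendmessage_url("123:abc", "-100123", "hola").as_str()
    // => "https://api.telegram.org/bot123:abc/sendMessage?chat_id=-100123&text=hola"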

scraper-rs/src/sites/coto.rs

@@ -53,7 +53,8 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         .filter_map(|h| h.get(dom.parser()))
         .find_map(|n| n.as_tag())
         .map(|t| t.inner_text(dom.parser()))
-        .map(|s| s.trim().to_string());
+        // https://github.com/catdevnull/preciazo/issues/24
+        .map(|s| html_escape::decode_html_entities(s.trim()).to_string());
 
     let image_url = dom
         .query_selector(".zoomImage1")
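This is the fix for preciazo issue #24 (linked in the new comment): Coto product names come out of the DOM with raw HTML entities, so the name is now entity-decoded after trimming. What html_escape::decode_html_entities does, in isolation (the product string is made up):

    fn demo() {
        let raw = " Mayonesa Hellmann&#39;s Cl&#225;sica 475 g ";
        let name = html_escape::decode_html_entities(raw.trim()).to_string();
        assert_eq!(name, "Mayonesa Hellmann's Clásica 475 g");
    }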

scraper-rs/src/sites/carrefour.rs

@@ -118,25 +118,6 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
         .try_collect()
 }
 
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_decode_url() -> anyhow::Result<()> {
-        let links = parse_urls_from_sitemap(
-            r#"
-            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
-            <url>
-                <loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g&#x200B;-684952/p</loc>
-                <lastmod>2024-01-12T10:41:25.962Z</lastmod>
-            </url>"#,
-        )?;
-        assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
-        Ok(())
-    }
-}
-
 pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
     let mut total: Vec<String> = vec![];
     let client = build_client();
@@ -146,7 +127,6 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
             let url = url.to_string();
             async move {
                 let client = client;
-                let url = url;
                 let text = get_retry_policy()
                     .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
                     .await?
@@ -165,3 +145,22 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
     }
     Ok(total.into_iter().unique().collect())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_decode_url() -> anyhow::Result<()> {
+        let links = parse_urls_from_sitemap(
+            r#"
+            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
+            <url>
+                <loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g&#x200B;-684952/p</loc>
+                <lastmod>2024-01-12T10:41:25.962Z</lastmod>
+            </url>"#,
+        )?;
+        assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
+        Ok(())
+    }
+}
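Moving the tests below the code they exercise is cosmetic; the test itself pins the interesting behavior: Carrefour sitemaps can embed a zero-width space as &#x200B; inside <loc>, and parse_urls_from_sitemap must yield the decoded character. If the implementation leans on quick-xml (a dependency in the Cargo.toml above), the decoding step amounts to something like this sketch (not the file's actual code):

    // quick-xml resolves numeric character references such as &#x200B;
    fn demo() -> anyhow::Result<()> {
        let loc = "https://example.com/postre-80-g&#x200B;-684952/p";
        let decoded = quick_xml::escape::unescape(loc)?;
        assert_eq!(decoded, "https://example.com/postre-80-g\u{200b}-684952/p");
        Ok(())
    }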

sitio: search +page.server.ts (SvelteKit)

@@ -8,9 +8,15 @@ export const load: PageServerLoad = async ({ url }) => {
   const query = url.searchParams.get("q");
   let results: null | { ean: string; name: string; imageUrl: string }[] = null;
   if (query) {
+    const sQuery = query
+      .replaceAll(`"`, `""`)
+      .split(" ")
+      .map((s) => `"${s}"`)
+      .join(" ");
+    console.debug(sQuery);
     const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
       join precios p on p.ean = f.ean
-      where f.name match ${`"${query}"`}
+      where f.name match ${sQuery}
       group by p.ean
       having max(p.fetched_at)
       order by p.in_stock desc;`;
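This is the fix for #21 from the commit list: instead of interpolating raw user input into MATCH, every whitespace-separated keyword is wrapped in double quotes so FTS5 treats it as a literal string, with embedded quotes escaped by doubling per SQLite FTS5 string syntax; adjacent quoted strings are implicitly ANDed. The page is TypeScript; the same transformation in Rust, keeping one language for the examples in this review:

    fn fts5_query(q: &str) -> String {
        // `"` -> `""` (FTS5 escapes a quote inside a string by doubling it),
        // then quote each space-separated token.
        q.replace('"', "\"\"")
            .split(' ')
            .map(|token| format!("\"{token}\""))
            .collect::<Vec<_>>()
            .join(" ")
    }

    // fts5_query("dulce de leche") => "\"dulce\" \"de\" \"leche\"",
    // so operators like OR/NEAR/- in user input lose their special meaning.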