Compare commits


No commits in common. "ec9ba5c53d53aa0fbf51ba70176ad03ec3a3581f" and "290d29ea78c1ed07524e4e8bf4a66dc22634ac1e" have entirely different histories.

7 changed files with 61 additions and 109 deletions

View file

@@ -13,15 +13,15 @@ jobs:
     name: chequear typescript del sitio
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - uses: pnpm/action-setup@v2
         with:
           version: 8
       - name: Use Node.js 20
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v3
         with:
           node-version: 20
-          cache: "pnpm"
+          cache: 'pnpm'
       - name: Install dependencies
         run: pnpm install

scraper-rs/Cargo.lock (generated)
View file

@@ -604,15 +604,6 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"

-[[package]]
-name = "html-escape"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
-dependencies = [
- "utf8-width",
-]
-
 [[package]]
 name = "http"
 version = "0.2.11"
@@ -1238,7 +1229,6 @@ dependencies = [
  "deadpool",
  "deadpool-sqlite",
  "futures",
- "html-escape",
  "itertools",
  "nanoid",
  "quick-xml",
@@ -1624,12 +1614,6 @@ dependencies = [
  "percent-encoding",
 ]

-[[package]]
-name = "utf8-width"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
-
 [[package]]
 name = "utf8parse"
 version = "0.2.1"

View file

@@ -14,7 +14,6 @@ cron = "0.12.0"
 deadpool = "0.10.0"
 deadpool-sqlite = "0.7.0"
 futures = "0.3.30"
-html-escape = "0.2.13"
 itertools = "0.12.0"
 nanoid = "0.4.0"
 quick-xml = "0.31.0"

View file

@@ -23,7 +23,7 @@ enum Supermercado {
     Coto,
 }
 impl Supermercado {
-    fn host(&self) -> &'static str {
+    fn host(self: &Self) -> &'static str {
         match self {
             Self::Dia => "diaonline.supermercadosdia.com.ar",
             Self::Carrefour => "www.carrefour.com.ar",
@@ -38,7 +38,6 @@ enum Args {
     FetchList(FetchListArgs),
     ParseFile(ParseFileArgs),
     GetUrlList(GetUrlListArgs),
-    ScrapUrl(ScrapUrlArgs),
     Auto(AutoArgs),
     Cron(AutoArgs),
 }
@@ -56,10 +55,6 @@ struct GetUrlListArgs {
     supermercado: Supermercado,
 }
 #[derive(clap::Args)]
-struct ScrapUrlArgs {
-    url: String,
-}
-#[derive(clap::Args)]
 struct AutoArgs {}

 #[tokio::main]
@@ -70,20 +65,11 @@ async fn main() -> anyhow::Result<()> {
         Args::FetchList(a) => fetch_list_cli(a.list_path).await,
         Args::ParseFile(a) => parse_file_cli(a.file_path).await,
         Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
-        Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
         Args::Auto(_) => auto_cli().await,
         Args::Cron(_) => cron_cli().await,
     }
 }

-async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
-    let client = build_client();
-    let res = fetch_and_parse(&client, url.clone()).await;
-    println!("Result: {:#?}", res);
-    res.map(|_| ())
-}
-
 async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     let links_str = fs::read_to_string(links_list_path).unwrap();
     let links = links_str
@@ -128,7 +114,8 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
 fn connect_db() -> Pool {
     let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
     let cfg = deadpool_sqlite::Config::new(db_path);
-    cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
+    let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
+    pool
 }

 fn build_client() -> reqwest::Client {
@@ -282,32 +269,27 @@ async fn scrap_url(
     let url_p = Url::parse(&url).unwrap();
     match url_p.host_str().unwrap() {
         "www.carrefour.com.ar" => {
-            sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "diaonline.supermercadosdia.com.ar" => {
-            sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "www.cotodigital3.com.ar" => {
-            sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
         s => bail!("Unknown host {}", s),
     }
 }

-#[derive(Clone)]
-struct AutoTelegram {
-    token: String,
-    chat_id: String,
-}
-
 #[derive(Clone)]
 struct Auto {
     pool: Pool,
-    telegram: Option<AutoTelegram>,
+    telegram_token: String,
+    telegram_chat_id: String,
 }
 impl Auto {
-    async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
+    async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> {
         {
             let t0 = now_sec();
             self.get_and_save_urls(&supermercado).await?;
@@ -359,7 +341,7 @@ impl Auto {
         Ok(())
     }

-    async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
+    async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
         let urls = get_urls(supermercado).await?;
         self.pool
             .get()
@@ -385,37 +367,30 @@ impl Auto {
         Ok(())
     }

-    async fn inform(&self, msg: &str) {
+    async fn inform(self: &Self, msg: &str) {
         println!("{}", msg);
-        if let Some(telegram) = &self.telegram {
         let u = Url::parse_with_params(
-            &format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
+            &format!(
+                "https://api.telegram.org/bot{}/sendMessage",
+                self.telegram_token
+            ),
             &[
-                ("chat_id", telegram.chat_id.clone()),
+                ("chat_id", self.telegram_chat_id.clone()),
                 ("text", msg.to_string()),
             ],
         )
         .unwrap();
         reqwest::get(u).await.unwrap();
     }
-    }
 }

 async fn auto_cli() -> anyhow::Result<()> {
     let db = connect_db();
-    let telegram = {
-        match (
-            env::var("TELEGRAM_BOT_TOKEN"),
-            env::var("TELEGRAM_BOT_CHAT_ID"),
-        ) {
-            (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
-            _ => {
-                tracing::warn!("No token or chat_id for telegram");
-                None
-            }
-        }
+    let auto = Auto {
+        pool: db,
+        telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
+        telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
     };
-    let auto = Auto { pool: db, telegram };
     auto.inform("[auto] Empezando scrap").await;
     let handles: Vec<_> = Supermercado::value_variants()
         .iter()
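
One behavioral consequence of this hunk (my reading of the diff, not text from the repository): auto_cli previously fell back to running without Telegram when the environment variables were missing, while the new version propagates the error and aborts. A minimal sketch of the two shapes, using hypothetical helper names and assuming the anyhow crate already used in this file:

```rust
use std::env;

// Hypothetical helpers illustrating the change in auto_cli's error handling;
// they are not functions from the repository.
fn telegram_config_old() -> Option<(String, String)> {
    // Old: a missing variable just disables Telegram notifications (after a warning).
    match (env::var("TELEGRAM_BOT_TOKEN"), env::var("TELEGRAM_BOT_CHAT_ID")) {
        (Ok(token), Ok(chat_id)) => Some((token, chat_id)),
        _ => None,
    }
}

fn telegram_config_new() -> anyhow::Result<(String, String)> {
    // New: a missing variable bubbles up as an error and the command exits.
    Ok((
        env::var("TELEGRAM_BOT_TOKEN")?,
        env::var("TELEGRAM_BOT_CHAT_ID")?,
    ))
}

fn main() {
    println!("old-style: {:?}", telegram_config_old());
    println!("new-style: {:?}", telegram_config_new());
}
```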

View file

@@ -53,8 +53,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         .filter_map(|h| h.get(dom.parser()))
         .find_map(|n| n.as_tag())
         .map(|t| t.inner_text(dom.parser()))
-        // https://github.com/catdevnull/preciazo/issues/24
-        .map(|s| html_escape::decode_html_entities(s.trim()).to_string());
+        .map(|s| s.trim().to_string());

     let image_url = dom
         .query_selector(".zoomImage1")

View file

@@ -118,34 +118,6 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
         .try_collect()
 }

-pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
-    let mut total: Vec<String> = vec![];
-    let client = build_client();
-    let handles = stream::iter(sitemaps)
-        .map(|url| {
-            let client = client.clone();
-            let url = url.to_string();
-            async move {
-                let client = client;
-                let text = get_retry_policy()
-                    .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
-                    .await?
-                    .text()
-                    .await?;
-                parse_urls_from_sitemap(&text)
-            }
-        })
-        // https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
-        .boxed()
-        .buffer_unordered(8)
-        .try_collect::<Vec<_>>()
-        .await?;
-    for mut urls in handles {
-        total.append(&mut urls);
-    }
-    Ok(total.into_iter().unique().collect())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -164,3 +136,32 @@ mod tests {
         Ok(())
     }
 }
+
+pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
+    let mut total: Vec<String> = vec![];
+    let client = build_client();
+    let handles = stream::iter(sitemaps)
+        .map(|url| {
+            let client = client.clone();
+            let url = url.to_string();
+            async move {
+                let client = client;
+                let url = url;
+                let text = get_retry_policy()
+                    .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
+                    .await?
+                    .text()
+                    .await?;
+                parse_urls_from_sitemap(&text)
+            }
+        })
+        // https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
+        .boxed()
+        .buffer_unordered(8)
+        .try_collect::<Vec<_>>()
+        .await?;
+    for mut urls in handles {
+        total.append(&mut urls);
+    }
+    Ok(total.into_iter().unique().collect())
+}

View file

@@ -8,15 +8,9 @@ export const load: PageServerLoad = async ({ url }) => {
   const query = url.searchParams.get("q");
   let results: null | { ean: string; name: string; imageUrl: string }[] = null;
   if (query) {
-    const sQuery = query
-      .replaceAll(`"`, `""`)
-      .split(" ")
-      .map((s) => `"${s}"`)
-      .join(" ");
-    console.debug(sQuery);
     const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
 join precios p on p.ean = f.ean
-where f.name match ${sQuery}
+where f.name match ${`"${query}"`}
 group by p.ean
 having max(p.fetched_at)
 order by p.in_stock desc;`;
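
A note on the search change above (my reading of the diff; the sketch is illustrative, not code from the repository): the old loader escaped embedded double quotes and quoted each whitespace-separated token, so FTS5 required every token to appear somewhere in the name, while the new loader wraps the raw query in a single pair of quotes, turning it into one phrase query. A small sketch of the two strings that end up in the `match` clause, using a hypothetical query:

```rust
// Illustrative only: how the two versions build the FTS5 MATCH string.
fn main() {
    let query = "yerba mate";

    // Old: escape quotes, then quote each whitespace-separated token.
    // FTS5 treats whitespace-separated quoted strings as an implicit AND.
    let per_token: String = query
        .replace('"', "\"\"")
        .split(' ')
        .map(|s| format!("\"{}\"", s))
        .collect::<Vec<_>>()
        .join(" ");
    assert_eq!(per_token, "\"yerba\" \"mate\"");

    // New: wrap the whole query in one pair of quotes -> a single phrase query.
    let phrase = format!("\"{}\"", query);
    assert_eq!(phrase, "\"yerba mate\"");
}
```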