mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
6 commits
290d29ea78
...
ec9ba5c53d
Author | SHA1 | Date | |
---|---|---|---|
ec9ba5c53d | |||
c687ea1484 | |||
cce34571f1 | |||
f7bc0a9db8 | |||
856dfcb1a4 | |||
94510825c1 |
7 changed files with 100 additions and 52 deletions
6
.github/workflows/container.yml
vendored
6
.github/workflows/container.yml
vendored
|
@ -13,15 +13,15 @@ jobs:
|
|||
name: chequear typescript del sitio
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
- uses: pnpm/action-setup@v2
|
||||
with:
|
||||
version: 8
|
||||
- name: Use Node.js 20
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'pnpm'
|
||||
cache: "pnpm"
|
||||
- name: Install dependencies
|
||||
run: pnpm install
|
||||
|
||||
|
|
16
scraper-rs/Cargo.lock
generated
16
scraper-rs/Cargo.lock
generated
|
@ -604,6 +604,15 @@ version = "0.3.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
||||
|
||||
[[package]]
|
||||
name = "html-escape"
|
||||
version = "0.2.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||
dependencies = [
|
||||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.11"
|
||||
|
@ -1229,6 +1238,7 @@ dependencies = [
|
|||
"deadpool",
|
||||
"deadpool-sqlite",
|
||||
"futures",
|
||||
"html-escape",
|
||||
"itertools",
|
||||
"nanoid",
|
||||
"quick-xml",
|
||||
|
@ -1614,6 +1624,12 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.1"
|
||||
|
|
|
@ -14,6 +14,7 @@ cron = "0.12.0"
|
|||
deadpool = "0.10.0"
|
||||
deadpool-sqlite = "0.7.0"
|
||||
futures = "0.3.30"
|
||||
html-escape = "0.2.13"
|
||||
itertools = "0.12.0"
|
||||
nanoid = "0.4.0"
|
||||
quick-xml = "0.31.0"
|
||||
|
|
|
@ -23,7 +23,7 @@ enum Supermercado {
|
|||
Coto,
|
||||
}
|
||||
impl Supermercado {
|
||||
fn host(self: &Self) -> &'static str {
|
||||
fn host(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
||||
Self::Carrefour => "www.carrefour.com.ar",
|
||||
|
@ -38,6 +38,7 @@ enum Args {
|
|||
FetchList(FetchListArgs),
|
||||
ParseFile(ParseFileArgs),
|
||||
GetUrlList(GetUrlListArgs),
|
||||
ScrapUrl(ScrapUrlArgs),
|
||||
Auto(AutoArgs),
|
||||
Cron(AutoArgs),
|
||||
}
|
||||
|
@ -55,6 +56,10 @@ struct GetUrlListArgs {
|
|||
supermercado: Supermercado,
|
||||
}
|
||||
// Arguments carried by the `Args::ScrapUrl` variant.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc comments into help text.
#[derive(clap::Args)]
|
||||
struct ScrapUrlArgs {
|
||||
// URL handed to `scrap_url_cli`, which fetches and parses it.
url: String,
|
||||
}
|
||||
// Payload of the `Args::Auto` and `Args::Cron` variants; it declares no options.
#[derive(clap::Args)]
|
||||
struct AutoArgs {}
|
||||
|
||||
#[tokio::main]
|
||||
|
@ -65,11 +70,20 @@ async fn main() -> anyhow::Result<()> {
|
|||
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
||||
Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
|
||||
Args::Auto(_) => auto_cli().await,
|
||||
Args::Cron(_) => cron_cli().await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
|
||||
let client = build_client();
|
||||
let res = fetch_and_parse(&client, url.clone()).await;
|
||||
|
||||
println!("Result: {:#?}", res);
|
||||
res.map(|_| ())
|
||||
}
|
||||
|
||||
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||
let links = links_str
|
||||
|
@ -114,8 +128,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
|||
fn connect_db() -> Pool {
|
||||
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
||||
let cfg = deadpool_sqlite::Config::new(db_path);
|
||||
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
|
||||
pool
|
||||
cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
|
||||
}
|
||||
|
||||
fn build_client() -> reqwest::Client {
|
||||
|
@ -269,27 +282,32 @@ async fn scrap_url(
|
|||
let url_p = Url::parse(&url).unwrap();
|
||||
match url_p.host_str().unwrap() {
|
||||
"www.carrefour.com.ar" => {
|
||||
sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||
sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||
}
|
||||
"diaonline.supermercadosdia.com.ar" => {
|
||||
sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||
sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||
}
|
||||
"www.cotodigital3.com.ar" => {
|
||||
sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
||||
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||
}
|
||||
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
||||
s => bail!("Unknown host {}", s),
|
||||
}
|
||||
}
|
||||
|
||||
/// Telegram credentials used by `Auto::inform` to send notifications.
///
/// `auto_cli` fills it from the `TELEGRAM_BOT_TOKEN` and `TELEGRAM_BOT_CHAT_ID`
/// environment variables when both can be read.
#[derive(Clone)]
|
||||
struct AutoTelegram {
|
||||
/// Bot token, interpolated into the `https://api.telegram.org/bot{}/sendMessage` URL.
token: String,
|
||||
/// Value sent as the `chat_id` query parameter.
chat_id: String,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Auto {
|
||||
pool: Pool,
|
||||
telegram_token: String,
|
||||
telegram_chat_id: String,
|
||||
telegram: Option<AutoTelegram>,
|
||||
}
|
||||
impl Auto {
|
||||
async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> {
|
||||
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
|
||||
{
|
||||
let t0 = now_sec();
|
||||
self.get_and_save_urls(&supermercado).await?;
|
||||
|
@ -341,7 +359,7 @@ impl Auto {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
|
||||
async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
|
||||
let urls = get_urls(supermercado).await?;
|
||||
self.pool
|
||||
.get()
|
||||
|
@ -367,30 +385,37 @@ impl Auto {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn inform(self: &Self, msg: &str) {
|
||||
async fn inform(&self, msg: &str) {
|
||||
println!("{}", msg);
|
||||
let u = Url::parse_with_params(
|
||||
&format!(
|
||||
"https://api.telegram.org/bot{}/sendMessage",
|
||||
self.telegram_token
|
||||
),
|
||||
&[
|
||||
("chat_id", self.telegram_chat_id.clone()),
|
||||
("text", msg.to_string()),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
reqwest::get(u).await.unwrap();
|
||||
if let Some(telegram) = &self.telegram {
|
||||
let u = Url::parse_with_params(
|
||||
&format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
|
||||
&[
|
||||
("chat_id", telegram.chat_id.clone()),
|
||||
("text", msg.to_string()),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
reqwest::get(u).await.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn auto_cli() -> anyhow::Result<()> {
|
||||
let db = connect_db();
|
||||
let auto = Auto {
|
||||
pool: db,
|
||||
telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
|
||||
telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
|
||||
let telegram = {
|
||||
match (
|
||||
env::var("TELEGRAM_BOT_TOKEN"),
|
||||
env::var("TELEGRAM_BOT_CHAT_ID"),
|
||||
) {
|
||||
(Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
|
||||
_ => {
|
||||
tracing::warn!("No token or chat_id for telegram");
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
let auto = Auto { pool: db, telegram };
|
||||
auto.inform("[auto] Empezando scrap").await;
|
||||
let handles: Vec<_> = Supermercado::value_variants()
|
||||
.iter()
|
||||
|
|
|
@ -53,7 +53,8 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
|||
.filter_map(|h| h.get(dom.parser()))
|
||||
.find_map(|n| n.as_tag())
|
||||
.map(|t| t.inner_text(dom.parser()))
|
||||
.map(|s| s.trim().to_string());
|
||||
// https://github.com/catdevnull/preciazo/issues/24
|
||||
.map(|s| html_escape::decode_html_entities(s.trim()).to_string());
|
||||
|
||||
let image_url = dom
|
||||
.query_selector(".zoomImage1")
|
||||
|
|
|
@ -118,25 +118,6 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
|||
.try_collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_decode_url() -> anyhow::Result<()> {
|
||||
let links = parse_urls_from_sitemap(
|
||||
r#"
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||
<url>
|
||||
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g​-684952/p</loc>
|
||||
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
||||
</url>"#,
|
||||
)?;
|
||||
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
||||
let mut total: Vec<String> = vec![];
|
||||
let client = build_client();
|
||||
|
@ -146,7 +127,6 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
|||
let url = url.to_string();
|
||||
async move {
|
||||
let client = client;
|
||||
let url = url;
|
||||
let text = get_retry_policy()
|
||||
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
||||
.await?
|
||||
|
@ -165,3 +145,22 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
|||
}
|
||||
Ok(total.into_iter().unique().collect())
|
||||
}
|
||||
|
||||
// Unit tests for `parse_urls_from_sitemap`.
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Parses a one-entry sitemap and checks that the first returned link is the `<loc>` URL
// with U+200B (zero-width space) between `80-g` and `-684952`.
// NOTE(review): this rendering shows the character literally inside `<loc>`; the original
// file may spell it as an XML character reference — confirm against the repository.
#[test]
|
||||
fn test_decode_url() -> anyhow::Result<()> {
|
||||
let links = parse_urls_from_sitemap(
|
||||
r#"
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||
<url>
|
||||
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g​-684952/p</loc>
|
||||
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
||||
</url>"#,
|
||||
)?;
|
||||
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,9 +8,15 @@ export const load: PageServerLoad = async ({ url }) => {
|
|||
const query = url.searchParams.get("q");
|
||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||
if (query) {
|
||||
const sQuery = query
|
||||
.replaceAll(`"`, `""`)
|
||||
.split(" ")
|
||||
.map((s) => `"${s}"`)
|
||||
.join(" ");
|
||||
console.debug(sQuery);
|
||||
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||
join precios p on p.ean = f.ean
|
||||
where f.name match ${`"${query}"`}
|
||||
where f.name match ${sQuery}
|
||||
group by p.ean
|
||||
having max(p.fetched_at)
|
||||
order by p.in_stock desc;`;
|
||||
|
|
Loading…
Reference in a new issue