mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
Compare commits
6 commits
290d29ea78
...
ec9ba5c53d
Author | SHA1 | Date | |
---|---|---|---|
ec9ba5c53d | |||
c687ea1484 | |||
cce34571f1 | |||
f7bc0a9db8 | |||
856dfcb1a4 | |||
94510825c1 |
7 changed files with 100 additions and 52 deletions
6
.github/workflows/container.yml
vendored
6
.github/workflows/container.yml
vendored
|
@ -13,15 +13,15 @@ jobs:
|
||||||
name: chequear typescript del sitio
|
name: chequear typescript del sitio
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- uses: pnpm/action-setup@v2
|
- uses: pnpm/action-setup@v2
|
||||||
with:
|
with:
|
||||||
version: 8
|
version: 8
|
||||||
- name: Use Node.js 20
|
- name: Use Node.js 20
|
||||||
uses: actions/setup-node@v3
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: 20
|
node-version: 20
|
||||||
cache: 'pnpm'
|
cache: "pnpm"
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: pnpm install
|
run: pnpm install
|
||||||
|
|
||||||
|
|
16
scraper-rs/Cargo.lock
generated
16
scraper-rs/Cargo.lock
generated
|
@ -604,6 +604,15 @@ version = "0.3.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html-escape"
|
||||||
|
version = "0.2.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||||
|
dependencies = [
|
||||||
|
"utf8-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
|
@ -1229,6 +1238,7 @@ dependencies = [
|
||||||
"deadpool",
|
"deadpool",
|
||||||
"deadpool-sqlite",
|
"deadpool-sqlite",
|
||||||
"futures",
|
"futures",
|
||||||
|
"html-escape",
|
||||||
"itertools",
|
"itertools",
|
||||||
"nanoid",
|
"nanoid",
|
||||||
"quick-xml",
|
"quick-xml",
|
||||||
|
@ -1614,6 +1624,12 @@ dependencies = [
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-width"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8parse"
|
name = "utf8parse"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
|
|
|
@ -14,6 +14,7 @@ cron = "0.12.0"
|
||||||
deadpool = "0.10.0"
|
deadpool = "0.10.0"
|
||||||
deadpool-sqlite = "0.7.0"
|
deadpool-sqlite = "0.7.0"
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
|
html-escape = "0.2.13"
|
||||||
itertools = "0.12.0"
|
itertools = "0.12.0"
|
||||||
nanoid = "0.4.0"
|
nanoid = "0.4.0"
|
||||||
quick-xml = "0.31.0"
|
quick-xml = "0.31.0"
|
||||||
|
|
|
@ -23,7 +23,7 @@ enum Supermercado {
|
||||||
Coto,
|
Coto,
|
||||||
}
|
}
|
||||||
impl Supermercado {
|
impl Supermercado {
|
||||||
fn host(self: &Self) -> &'static str {
|
fn host(&self) -> &'static str {
|
||||||
match self {
|
match self {
|
||||||
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
||||||
Self::Carrefour => "www.carrefour.com.ar",
|
Self::Carrefour => "www.carrefour.com.ar",
|
||||||
|
@ -38,6 +38,7 @@ enum Args {
|
||||||
FetchList(FetchListArgs),
|
FetchList(FetchListArgs),
|
||||||
ParseFile(ParseFileArgs),
|
ParseFile(ParseFileArgs),
|
||||||
GetUrlList(GetUrlListArgs),
|
GetUrlList(GetUrlListArgs),
|
||||||
|
ScrapUrl(ScrapUrlArgs),
|
||||||
Auto(AutoArgs),
|
Auto(AutoArgs),
|
||||||
Cron(AutoArgs),
|
Cron(AutoArgs),
|
||||||
}
|
}
|
||||||
|
@ -55,6 +56,10 @@ struct GetUrlListArgs {
|
||||||
supermercado: Supermercado,
|
supermercado: Supermercado,
|
||||||
}
|
}
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
|
struct ScrapUrlArgs {
|
||||||
|
url: String,
|
||||||
|
}
|
||||||
|
#[derive(clap::Args)]
|
||||||
struct AutoArgs {}
|
struct AutoArgs {}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
@ -65,11 +70,20 @@ async fn main() -> anyhow::Result<()> {
|
||||||
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
Args::FetchList(a) => fetch_list_cli(a.list_path).await,
|
||||||
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
Args::ParseFile(a) => parse_file_cli(a.file_path).await,
|
||||||
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
|
||||||
|
Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
|
||||||
Args::Auto(_) => auto_cli().await,
|
Args::Auto(_) => auto_cli().await,
|
||||||
Args::Cron(_) => cron_cli().await,
|
Args::Cron(_) => cron_cli().await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
|
||||||
|
let client = build_client();
|
||||||
|
let res = fetch_and_parse(&client, url.clone()).await;
|
||||||
|
|
||||||
|
println!("Result: {:#?}", res);
|
||||||
|
res.map(|_| ())
|
||||||
|
}
|
||||||
|
|
||||||
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
||||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||||
let links = links_str
|
let links = links_str
|
||||||
|
@ -114,8 +128,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
||||||
fn connect_db() -> Pool {
|
fn connect_db() -> Pool {
|
||||||
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
||||||
let cfg = deadpool_sqlite::Config::new(db_path);
|
let cfg = deadpool_sqlite::Config::new(db_path);
|
||||||
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
|
cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
|
||||||
pool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_client() -> reqwest::Client {
|
fn build_client() -> reqwest::Client {
|
||||||
|
@ -269,27 +282,32 @@ async fn scrap_url(
|
||||||
let url_p = Url::parse(&url).unwrap();
|
let url_p = Url::parse(&url).unwrap();
|
||||||
match url_p.host_str().unwrap() {
|
match url_p.host_str().unwrap() {
|
||||||
"www.carrefour.com.ar" => {
|
"www.carrefour.com.ar" => {
|
||||||
sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||||
}
|
}
|
||||||
"diaonline.supermercadosdia.com.ar" => {
|
"diaonline.supermercadosdia.com.ar" => {
|
||||||
sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||||
}
|
}
|
||||||
"www.cotodigital3.com.ar" => {
|
"www.cotodigital3.com.ar" => {
|
||||||
sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
|
sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
|
||||||
}
|
}
|
||||||
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
"www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
|
||||||
s => bail!("Unknown host {}", s),
|
s => bail!("Unknown host {}", s),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct AutoTelegram {
|
||||||
|
token: String,
|
||||||
|
chat_id: String,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct Auto {
|
struct Auto {
|
||||||
pool: Pool,
|
pool: Pool,
|
||||||
telegram_token: String,
|
telegram: Option<AutoTelegram>,
|
||||||
telegram_chat_id: String,
|
|
||||||
}
|
}
|
||||||
impl Auto {
|
impl Auto {
|
||||||
async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> {
|
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
|
||||||
{
|
{
|
||||||
let t0 = now_sec();
|
let t0 = now_sec();
|
||||||
self.get_and_save_urls(&supermercado).await?;
|
self.get_and_save_urls(&supermercado).await?;
|
||||||
|
@ -341,7 +359,7 @@ impl Auto {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
|
async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
|
||||||
let urls = get_urls(supermercado).await?;
|
let urls = get_urls(supermercado).await?;
|
||||||
self.pool
|
self.pool
|
||||||
.get()
|
.get()
|
||||||
|
@ -367,30 +385,37 @@ impl Auto {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn inform(self: &Self, msg: &str) {
|
async fn inform(&self, msg: &str) {
|
||||||
println!("{}", msg);
|
println!("{}", msg);
|
||||||
let u = Url::parse_with_params(
|
if let Some(telegram) = &self.telegram {
|
||||||
&format!(
|
let u = Url::parse_with_params(
|
||||||
"https://api.telegram.org/bot{}/sendMessage",
|
&format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
|
||||||
self.telegram_token
|
&[
|
||||||
),
|
("chat_id", telegram.chat_id.clone()),
|
||||||
&[
|
("text", msg.to_string()),
|
||||||
("chat_id", self.telegram_chat_id.clone()),
|
],
|
||||||
("text", msg.to_string()),
|
)
|
||||||
],
|
.unwrap();
|
||||||
)
|
reqwest::get(u).await.unwrap();
|
||||||
.unwrap();
|
}
|
||||||
reqwest::get(u).await.unwrap();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn auto_cli() -> anyhow::Result<()> {
|
async fn auto_cli() -> anyhow::Result<()> {
|
||||||
let db = connect_db();
|
let db = connect_db();
|
||||||
let auto = Auto {
|
let telegram = {
|
||||||
pool: db,
|
match (
|
||||||
telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
|
env::var("TELEGRAM_BOT_TOKEN"),
|
||||||
telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
|
env::var("TELEGRAM_BOT_CHAT_ID"),
|
||||||
|
) {
|
||||||
|
(Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
|
||||||
|
_ => {
|
||||||
|
tracing::warn!("No token or chat_id for telegram");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
let auto = Auto { pool: db, telegram };
|
||||||
auto.inform("[auto] Empezando scrap").await;
|
auto.inform("[auto] Empezando scrap").await;
|
||||||
let handles: Vec<_> = Supermercado::value_variants()
|
let handles: Vec<_> = Supermercado::value_variants()
|
||||||
.iter()
|
.iter()
|
||||||
|
|
|
@ -53,7 +53,8 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
||||||
.filter_map(|h| h.get(dom.parser()))
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
.find_map(|n| n.as_tag())
|
.find_map(|n| n.as_tag())
|
||||||
.map(|t| t.inner_text(dom.parser()))
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
.map(|s| s.trim().to_string());
|
// https://github.com/catdevnull/preciazo/issues/24
|
||||||
|
.map(|s| html_escape::decode_html_entities(s.trim()).to_string());
|
||||||
|
|
||||||
let image_url = dom
|
let image_url = dom
|
||||||
.query_selector(".zoomImage1")
|
.query_selector(".zoomImage1")
|
||||||
|
|
|
@ -118,25 +118,6 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
||||||
.try_collect()
|
.try_collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_decode_url() -> anyhow::Result<()> {
|
|
||||||
let links = parse_urls_from_sitemap(
|
|
||||||
r#"
|
|
||||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
|
||||||
<url>
|
|
||||||
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g​-684952/p</loc>
|
|
||||||
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
|
||||||
</url>"#,
|
|
||||||
)?;
|
|
||||||
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
||||||
let mut total: Vec<String> = vec![];
|
let mut total: Vec<String> = vec![];
|
||||||
let client = build_client();
|
let client = build_client();
|
||||||
|
@ -146,7 +127,6 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
let url = url.to_string();
|
let url = url.to_string();
|
||||||
async move {
|
async move {
|
||||||
let client = client;
|
let client = client;
|
||||||
let url = url;
|
|
||||||
let text = get_retry_policy()
|
let text = get_retry_policy()
|
||||||
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
|
@ -165,3 +145,22 @@ pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<St
|
||||||
}
|
}
|
||||||
Ok(total.into_iter().unique().collect())
|
Ok(total.into_iter().unique().collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_decode_url() -> anyhow::Result<()> {
|
||||||
|
let links = parse_urls_from_sitemap(
|
||||||
|
r#"
|
||||||
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||||
|
<url>
|
||||||
|
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g​-684952/p</loc>
|
||||||
|
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
||||||
|
</url>"#,
|
||||||
|
)?;
|
||||||
|
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -8,9 +8,15 @@ export const load: PageServerLoad = async ({ url }) => {
|
||||||
const query = url.searchParams.get("q");
|
const query = url.searchParams.get("q");
|
||||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||||
if (query) {
|
if (query) {
|
||||||
|
const sQuery = query
|
||||||
|
.replaceAll(`"`, `""`)
|
||||||
|
.split(" ")
|
||||||
|
.map((s) => `"${s}"`)
|
||||||
|
.join(" ");
|
||||||
|
console.debug(sQuery);
|
||||||
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||||
join precios p on p.ean = f.ean
|
join precios p on p.ean = f.ean
|
||||||
where f.name match ${`"${query}"`}
|
where f.name match ${sQuery}
|
||||||
group by p.ean
|
group by p.ean
|
||||||
having max(p.fetched_at)
|
having max(p.fetched_at)
|
||||||
order by p.in_stock desc;`;
|
order by p.in_stock desc;`;
|
||||||
|
|
Loading…
Reference in a new issue