Mirror of https://github.com/catdevnull/preciazo.git, synced 2024-11-23 06:36:19 +00:00

Compare commits: ec9ba5c53d...290d29ea78

No commits in common: "ec9ba5c53d53aa0fbf51ba70176ad03ec3a3581f" and "290d29ea78c1ed07524e4e8bf4a66dc22634ac1e" have entirely different histories. In the diffs below, "-" lines come from ec9ba5c53d and "+" lines from 290d29ea78.

7 changed files with 61 additions and 109 deletions
.github/workflows/container.yml (vendored): 6 changed lines

@@ -13,15 +13,15 @@ jobs:
     name: chequear typescript del sitio
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - uses: pnpm/action-setup@v2
         with:
           version: 8
      - name: Use Node.js 20
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v3
         with:
           node-version: 20
-          cache: "pnpm"
+          cache: 'pnpm'
       - name: Install dependencies
         run: pnpm install
scraper-rs/Cargo.lock (generated): 16 changed lines

@@ -604,15 +604,6 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
-[[package]]
-name = "html-escape"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
-dependencies = [
- "utf8-width",
-]
-
 [[package]]
 name = "http"
 version = "0.2.11"

@@ -1238,7 +1229,6 @@ dependencies = [
  "deadpool",
  "deadpool-sqlite",
  "futures",
- "html-escape",
  "itertools",
  "nanoid",
  "quick-xml",

@@ -1624,12 +1614,6 @@ dependencies = [
  "percent-encoding",
 ]
 
-[[package]]
-name = "utf8-width"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
-
 [[package]]
 name = "utf8parse"
 version = "0.2.1"
scraper-rs/Cargo.toml

@@ -14,7 +14,6 @@ cron = "0.12.0"
 deadpool = "0.10.0"
 deadpool-sqlite = "0.7.0"
 futures = "0.3.30"
-html-escape = "0.2.13"
 itertools = "0.12.0"
 nanoid = "0.4.0"
 quick-xml = "0.31.0"
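The only manifest change is that the plus side drops html-escape (utf8-width leaves the lockfile with it, since html-escape was its only dependent). For context, a minimal sketch of what the crate does for scraped text; the product name is made up:

```rust
// assumes html-escape = "0.2.13" in Cargo.toml, as on the minus side
fn main() {
    // hypothetical product name scraped with entities still encoded
    let raw = "Yerba Playadito 1&#160;kg &amp; envase";
    let decoded = html_escape::decode_html_entities(raw);
    assert_eq!(decoded, "Yerba Playadito 1\u{a0}kg & envase");
    println!("{decoded}");
}
```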
scraper-rs/src/main.rs

@@ -23,7 +23,7 @@ enum Supermercado {
     Coto,
 }
 impl Supermercado {
-    fn host(&self) -> &'static str {
+    fn host(self: &Self) -> &'static str {
         match self {
             Self::Dia => "diaonline.supermercadosdia.com.ar",
             Self::Carrefour => "www.carrefour.com.ar",
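This hunk (like the get_and_save_urls and inform hunks below) differs only in receiver syntax: `&self` is shorthand that the compiler expands to `self: &Self`, so both sides declare the same method. A compilable illustration:

```rust
struct Demo;

impl Demo {
    fn shorthand(&self) {}         // sugared receiver
    fn spelled_out(self: &Self) {} // identical meaning, written out
}

fn main() {
    Demo.shorthand();
    Demo.spelled_out();
}
```

Clippy's `needless_arbitrary_self_type` lint flags the spelled-out form, which is presumably why one tree normalized it.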
@@ -38,7 +38,6 @@ enum Args {
     FetchList(FetchListArgs),
     ParseFile(ParseFileArgs),
     GetUrlList(GetUrlListArgs),
-    ScrapUrl(ScrapUrlArgs),
     Auto(AutoArgs),
     Cron(AutoArgs),
 }

@@ -56,10 +55,6 @@ struct GetUrlListArgs {
     supermercado: Supermercado,
 }
 #[derive(clap::Args)]
-struct ScrapUrlArgs {
-    url: String,
-}
-#[derive(clap::Args)]
 struct AutoArgs {}
 
 #[tokio::main]
@@ -70,20 +65,11 @@ async fn main() -> anyhow::Result<()> {
         Args::FetchList(a) => fetch_list_cli(a.list_path).await,
         Args::ParseFile(a) => parse_file_cli(a.file_path).await,
         Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
-        Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
         Args::Auto(_) => auto_cli().await,
         Args::Cron(_) => cron_cli().await,
     }
 }
 
-async fn scrap_url_cli(url: String) -> anyhow::Result<()> {
-    let client = build_client();
-    let res = fetch_and_parse(&client, url.clone()).await;
-
-    println!("Result: {:#?}", res);
-    res.map(|_| ())
-}
-
 async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
     let links_str = fs::read_to_string(links_list_path).unwrap();
     let links = links_str
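Taken together, the three hunks above remove a scrap-url debugging subcommand end to end: the enum variant, its clap::Args struct, and the scrap_url_cli handler that printed the parse result for a single URL. A condensed, runnable sketch of just that wiring (other variants and the real fetch_and_parse omitted; the printed line is a stand-in):

```rust
use clap::Parser;

// deriving Parser on an enum makes each variant a subcommand,
// e.g. `scraper-rs scrap-url <URL>`
#[derive(Parser)]
enum Args {
    ScrapUrl(ScrapUrlArgs),
}

#[derive(clap::Args)]
struct ScrapUrlArgs {
    url: String,
}

fn main() {
    match Args::parse() {
        // stand-in for: fetch_and_parse(&client, a.url).await
        Args::ScrapUrl(a) => println!("would fetch and parse {}", a.url),
    }
}
```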
@@ -128,7 +114,8 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
 fn connect_db() -> Pool {
     let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
     let cfg = deadpool_sqlite::Config::new(db_path);
-    cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap()
+    let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
+    pool
 }
 
 fn build_client() -> reqwest::Client {
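Both sides of connect_db return the same pool; binding the result to a local and returning it is a behavioral no-op that clippy's `let_and_return` lint flags. A standalone comparison:

```rust
fn compute() -> i32 {
    42
}

// tail-expression form, as on the minus side
fn direct() -> i32 {
    compute()
}

// bind-then-return form, as on the plus side; same result
fn bound() -> i32 {
    let pool = compute();
    pool
}

fn main() {
    assert_eq!(direct(), bound());
}
```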
@@ -282,32 +269,27 @@ async fn scrap_url(
     let url_p = Url::parse(&url).unwrap();
     match url_p.host_str().unwrap() {
         "www.carrefour.com.ar" => {
-            sites::carrefour::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::carrefour::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "diaonline.supermercadosdia.com.ar" => {
-            sites::dia::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::dia::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "www.cotodigital3.com.ar" => {
-            sites::coto::parse(url, &tl::parse(body, tl::ParserOptions::default())?)
+            sites::coto::parse(url, &tl::parse(&body, tl::ParserOptions::default())?)
         }
         "www.jumbo.com.ar" => sites::jumbo::scrap(client, url, body).await,
         s => bail!("Unknown host {}", s),
     }
 }
 
-#[derive(Clone)]
-struct AutoTelegram {
-    token: String,
-    chat_id: String,
-}
-
 #[derive(Clone)]
 struct Auto {
     pool: Pool,
-    telegram: Option<AutoTelegram>,
+    telegram_token: String,
+    telegram_chat_id: String,
 }
 impl Auto {
-    async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
+    async fn download_supermercado(self: Self, supermercado: Supermercado) -> anyhow::Result<()> {
         {
             let t0 = now_sec();
             self.get_and_save_urls(&supermercado).await?;
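Two separate things happen here: `tl::parse` now receives `&body` instead of `body` (presumably because `body` is an owned String on the plus side), and the optional AutoTelegram config gives way to two required String fields on Auto. The host routing is untouched on both sides; a freestanding sketch of it, with the per-site parser calls reduced to labels:

```rust
use anyhow::bail;
use url::Url;

// stand-ins for the sites::* modules; only the routing is real
fn dispatch(url: &str) -> anyhow::Result<&'static str> {
    let parsed = Url::parse(url)?;
    match parsed.host_str().unwrap_or_default() {
        "www.carrefour.com.ar" => Ok("sites::carrefour::parse"),
        "diaonline.supermercadosdia.com.ar" => Ok("sites::dia::parse"),
        "www.cotodigital3.com.ar" => Ok("sites::coto::parse"),
        "www.jumbo.com.ar" => Ok("sites::jumbo::scrap"),
        s => bail!("Unknown host {}", s),
    }
}

fn main() -> anyhow::Result<()> {
    assert_eq!(dispatch("https://www.jumbo.com.ar/yerba/p")?, "sites::jumbo::scrap");
    Ok(())
}
```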
@@ -359,7 +341,7 @@ impl Auto {
         Ok(())
     }
 
-    async fn get_and_save_urls(&self, supermercado: &Supermercado) -> anyhow::Result<()> {
+    async fn get_and_save_urls(self: &Self, supermercado: &Supermercado) -> anyhow::Result<()> {
         let urls = get_urls(supermercado).await?;
         self.pool
             .get()
@@ -385,37 +367,30 @@ impl Auto {
         Ok(())
     }
 
-    async fn inform(&self, msg: &str) {
+    async fn inform(self: &Self, msg: &str) {
         println!("{}", msg);
-        if let Some(telegram) = &self.telegram {
-            let u = Url::parse_with_params(
-                &format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
-                &[
-                    ("chat_id", telegram.chat_id.clone()),
-                    ("text", msg.to_string()),
-                ],
-            )
-            .unwrap();
-            reqwest::get(u).await.unwrap();
-        }
+        let u = Url::parse_with_params(
+            &format!(
+                "https://api.telegram.org/bot{}/sendMessage",
+                self.telegram_token
+            ),
+            &[
+                ("chat_id", self.telegram_chat_id.clone()),
+                ("text", msg.to_string()),
+            ],
+        )
+        .unwrap();
+        reqwest::get(u).await.unwrap();
     }
 }
 
 async fn auto_cli() -> anyhow::Result<()> {
     let db = connect_db();
-    let telegram = {
-        match (
-            env::var("TELEGRAM_BOT_TOKEN"),
-            env::var("TELEGRAM_BOT_CHAT_ID"),
-        ) {
-            (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
-            _ => {
-                tracing::warn!("No token or chat_id for telegram");
-                None
-            }
-        }
+    let auto = Auto {
+        pool: db,
+        telegram_token: env::var("TELEGRAM_BOT_TOKEN")?,
+        telegram_chat_id: env::var("TELEGRAM_BOT_CHAT_ID")?,
     };
-    let auto = Auto { pool: db, telegram };
     auto.inform("[auto] Empezando scrap").await;
     let handles: Vec<_> = Supermercado::value_variants()
         .iter()
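This is the substantive main.rs change: the minus side treats Telegram credentials as optional (warn and keep scraping), while the plus side makes auto_cli fail outright with `?` when either env var is missing. A condensed sketch of the optional variant, with eprintln! standing in for tracing::warn! to stay dependency-free:

```rust
use std::env;

#[derive(Clone)]
struct AutoTelegram {
    token: String,
    chat_id: String,
}

fn telegram_from_env() -> Option<AutoTelegram> {
    match (
        env::var("TELEGRAM_BOT_TOKEN"),
        env::var("TELEGRAM_BOT_CHAT_ID"),
    ) {
        (Ok(token), Ok(chat_id)) => Some(AutoTelegram { token, chat_id }),
        _ => {
            eprintln!("No token or chat_id for telegram; notifications disabled");
            None
        }
    }
}

fn main() {
    // with the vars unset, scraping would still run; only reporting is skipped
    let telegram = telegram_from_env();
    println!("telegram configured: {}", telegram.is_some());
}
```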
@@ -53,8 +53,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
         .filter_map(|h| h.get(dom.parser()))
         .find_map(|n| n.as_tag())
         .map(|t| t.inner_text(dom.parser()))
-        // https://github.com/catdevnull/preciazo/issues/24
-        .map(|s| html_escape::decode_html_entities(s.trim()).to_string());
+        .map(|s| s.trim().to_string());
 
     let image_url = dom
         .query_selector(".zoomImage1")
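This hunk sits in the Coto product parser (the `.zoomImage1` selector and the issue link point there; in the repo layout that would be scraper-rs/src/sites/coto.rs). It is where html-escape was actually exercised: the minus side decodes HTML entities in the product name (issue #24), the plus side only trims. A runnable sketch of the pipeline against a static snippet, assuming tl's inner_text returns entities undecoded (the premise of that issue); the `h1.product_page` selector is made up:

```rust
fn main() -> anyhow::Result<()> {
    let html = r#"<h1 class="product_page"> Yerba &amp; Mate 1&#160;kg </h1>"#;
    let dom = tl::parse(html, tl::ParserOptions::default())?;
    let name = dom
        .query_selector("h1.product_page")
        .and_then(|mut hits| hits.next())
        .and_then(|h| h.get(dom.parser()))
        .and_then(|n| n.as_tag())
        .map(|t| t.inner_text(dom.parser()))
        // minus side: trim, then decode entities via the html-escape crate
        .map(|s| html_escape::decode_html_entities(s.trim()).to_string());
    assert_eq!(name.as_deref(), Some("Yerba & Mate 1\u{a0}kg"));
    Ok(())
}
```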
@@ -118,34 +118,6 @@ pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
         .try_collect()
 }
 
-pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
-    let mut total: Vec<String> = vec![];
-    let client = build_client();
-    let handles = stream::iter(sitemaps)
-        .map(|url| {
-            let client = client.clone();
-            let url = url.to_string();
-            async move {
-                let client = client;
-                let text = get_retry_policy()
-                    .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
-                    .await?
-                    .text()
-                    .await?;
-                parse_urls_from_sitemap(&text)
-            }
-        })
-        // https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
-        .boxed()
-        .buffer_unordered(8)
-        .try_collect::<Vec<_>>()
-        .await?;
-    for mut urls in handles {
-        total.append(&mut urls);
-    }
-    Ok(total.into_iter().unique().collect())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;

@@ -164,3 +136,32 @@ mod tests {
         Ok(())
     }
 }
+
+pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
+    let mut total: Vec<String> = vec![];
+    let client = build_client();
+    let handles = stream::iter(sitemaps)
+        .map(|url| {
+            let client = client.clone();
+            let url = url.to_string();
+            async move {
+                let client = client;
+                let url = url;
+                let text = get_retry_policy()
+                    .retry_if(|| do_request(&client, &url), retry_if_wasnt_not_found)
+                    .await?
+                    .text()
+                    .await?;
+                parse_urls_from_sitemap(&text)
+            }
+        })
+        // https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
+        .boxed()
+        .buffer_unordered(8)
+        .try_collect::<Vec<_>>()
+        .await?;
+    for mut urls in handles {
+        total.append(&mut urls);
+    }
+    Ok(total.into_iter().unique().collect())
+}
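These two hunks move get_urls_from_sitemap from the middle of the file to after the tests (plus a redundant `let url = url;` rebinding); judging by the sitemap helpers this is likely the shared VTEX scraping module. The pattern itself is worth seeing on its own: fan out one future per sitemap, cap concurrency at 8 with buffer_unordered, and dedupe the merged URLs. A self-contained sketch with the network call and XML parsing stubbed out (same futures/itertools/tokio stack the file uses):

```rust
use futures::{stream, StreamExt, TryStreamExt};
use itertools::Itertools;

async fn fetch_all(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
    let batches: Vec<Vec<String>> = stream::iter(sitemaps)
        .map(|url| {
            let url = url.to_string();
            async move {
                // stand-in for: do_request(&client, &url).await?.text().await?
                // followed by parse_urls_from_sitemap(&text)
                Ok::<_, anyhow::Error>(vec![url])
            }
        })
        .buffer_unordered(8) // at most 8 sitemap fetches in flight
        .try_collect()
        .await?;
    Ok(batches.into_iter().flatten().unique().collect())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let urls = fetch_all(vec!["https://example.com/sitemap.xml"]).await?;
    println!("{urls:?}");
    Ok(())
}
```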
@@ -8,15 +8,9 @@ export const load: PageServerLoad = async ({ url }) => {
   const query = url.searchParams.get("q");
   let results: null | { ean: string; name: string; imageUrl: string }[] = null;
   if (query) {
-    const sQuery = query
-      .replaceAll(`"`, `""`)
-      .split(" ")
-      .map((s) => `"${s}"`)
-      .join(" ");
-    console.debug(sQuery);
     const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
 join precios p on p.ean = f.ean
-where f.name match ${sQuery}
+where f.name match ${`"${query}"`}
 group by p.ean
 having max(p.fetched_at)
 order by p.in_stock desc;`;
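This last hunk is from a SvelteKit +page.server.ts load function (presumably the search route of the sitio app) and changes how the user's query is quoted for SQLite's FTS5 MATCH. The minus side escapes embedded double quotes and quotes each whitespace-separated token, an implicit AND of single-token phrases; the plus side wraps the whole query in one pair of quotes, a single phrase that requires the tokens adjacent and in order, and as written does not escape embedded quotes. The two strategies as pure string functions, in Rust to match the rest of this compare (function names are mine):

```rust
fn per_token(query: &str) -> String {
    // minus side: escape `"` as `""`, then quote each token: "yerba" "1kg"
    query
        .replace('"', "\"\"")
        .split(' ')
        .map(|token| format!("\"{token}\""))
        .collect::<Vec<_>>()
        .join(" ")
}

fn whole_phrase(query: &str) -> String {
    // plus side: one quoted phrase: "yerba 1kg" (inner quotes not escaped)
    format!("\"{query}\"")
}

fn main() {
    assert_eq!(per_token("yerba 1kg"), r#""yerba" "1kg""#);
    assert_eq!(whole_phrase("yerba 1kg"), r#""yerba 1kg""#);
}
```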