Compare commits

...

6 commits

8 changed files with 125 additions and 125 deletions

2
.gitignore vendored
View file

@ -5,7 +5,7 @@ data/carrefour
p.* p.*
p p
node_modules/ node_modules/
*.db *.db*
*.db-shm *.db-shm
*.db-wal *.db-wal
scraper/debug/ scraper/debug/

View file

@ -42,6 +42,9 @@ importers:
drizzle-orm: drizzle-orm:
specifier: ^0.29.1 specifier: ^0.29.1
version: 0.29.3(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2) version: 0.29.3(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)
zod:
specifier: ^3.22.4
version: 3.22.4
devDependencies: devDependencies:
'@sveltejs/adapter-node': '@sveltejs/adapter-node':
specifier: ^2.0.2 specifier: ^2.0.2
@ -2859,4 +2862,3 @@ packages:
/zod@3.22.4: /zod@3.22.4:
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==} resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
dev: true

View file

@ -47,7 +47,10 @@ struct ScrapUrlArgs {
url: String, url: String,
} }
#[derive(clap::Args)] #[derive(clap::Args)]
struct AutoArgs {} struct AutoArgs {
#[arg(long)]
n_products: Option<usize>,
}
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
@ -59,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await, Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
Args::ScrapUrl(a) => scrap_url_cli(a.url).await, Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
Args::ScrapBestSelling => scrap_best_selling_cli().await, Args::ScrapBestSelling => scrap_best_selling_cli().await,
Args::Auto(_) => auto_cli().await, Args::Auto(a) => auto_cli(a).await,
Args::Cron(_) => cron_cli().await, Args::Cron(_) => cron_cli().await,
} }
} }
@ -166,7 +169,7 @@ fn build_client() -> reqwest::Client {
headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap()); headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap());
reqwest::ClientBuilder::default() reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5)) .timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60)) .connect_timeout(Duration::from_secs(30))
.default_headers(headers) .default_headers(headers)
.build() .build()
.unwrap() .unwrap()
@ -176,10 +179,19 @@ pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<
let response = client.execute(request).await?.error_for_status()?; let response = client.execute(request).await?.error_for_status()?;
Ok(response) Ok(response)
} }
async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
let res = do_request(client, url).await?;
res.text().await
}
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
get_retry_policy()
.retry_if(|| request_and_body(client, url), retry_if_wasnt_not_found)
.await
}
pub fn get_retry_policy() -> again::RetryPolicy { pub fn get_retry_policy() -> again::RetryPolicy {
RetryPolicy::exponential(Duration::from_millis(300)) RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10) .with_max_retries(20)
.with_jitter(true) .with_jitter(true)
} }
@ -192,11 +204,7 @@ async fn fetch_and_parse(
client: &reqwest::Client, client: &reqwest::Client,
url: String, url: String,
) -> Result<PrecioPoint, anyhow::Error> { ) -> Result<PrecioPoint, anyhow::Error> {
let body = get_retry_policy() let body = fetch_body(client, &url).await?;
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
.await?
.text()
.await?;
let maybe_point = { scrap_url(client, url, &body).await }; let maybe_point = { scrap_url(client, url, &body).await };
@ -287,6 +295,7 @@ struct AutoTelegram {
struct Auto { struct Auto {
db: Db, db: Db,
telegram: Option<AutoTelegram>, telegram: Option<AutoTelegram>,
limit_n_products: Option<usize>,
} }
impl Auto { impl Auto {
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> { async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
@ -300,7 +309,13 @@ impl Auto {
)) ))
.await; .await;
} }
let links: Vec<String> = self.db.get_urls_by_domain(supermercado.host()).await?; let links: Vec<String> = {
let mut links = self.db.get_urls_by_domain(supermercado.host()).await?;
if let Some(n) = self.limit_n_products {
links.truncate(n);
}
links
};
// { // {
// let debug_path = PathBuf::from("debug/"); // let debug_path = PathBuf::from("debug/");
// tokio::fs::create_dir_all(&debug_path).await.unwrap(); // tokio::fs::create_dir_all(&debug_path).await.unwrap();
@ -340,7 +355,7 @@ impl Auto {
} }
async fn inform(&self, msg: &str) { async fn inform(&self, msg: &str) {
println!("{}", msg); tracing::info!("{}", msg);
if let Some(telegram) = &self.telegram { if let Some(telegram) = &self.telegram {
let u = Url::parse_with_params( let u = Url::parse_with_params(
&format!("https://api.telegram.org/bot{}/sendMessage", telegram.token), &format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
@ -355,7 +370,7 @@ impl Auto {
} }
} }
async fn auto_cli() -> anyhow::Result<()> { async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
let auto = { let auto = {
let db = Db::connect().await?; let db = Db::connect().await?;
let telegram = { let telegram = {
@ -370,7 +385,11 @@ async fn auto_cli() -> anyhow::Result<()> {
} }
} }
}; };
Auto { db, telegram } Auto {
db,
telegram,
limit_n_products: args.n_products,
}
}; };
auto.inform("[auto] Empezando scrap").await; auto.inform("[auto] Empezando scrap").await;
let handles: Vec<_> = Supermercado::value_variants() let handles: Vec<_> = Supermercado::value_variants()
@ -378,6 +397,7 @@ async fn auto_cli() -> anyhow::Result<()> {
.map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned()))) .map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned())))
.collect(); .collect();
future::try_join_all(handles).await?; future::try_join_all(handles).await?;
auto.inform("[auto] Download supermercados finished").await;
let best_selling = auto let best_selling = auto
.inform_time( .inform_time(
@ -404,7 +424,7 @@ async fn cron_cli() -> anyhow::Result<()> {
.unwrap(); .unwrap();
println!("Waiting for {:?}", t); println!("Waiting for {:?}", t);
tokio::time::sleep(t).await; tokio::time::sleep(t).await;
auto_cli().await.unwrap(); auto_cli(AutoArgs { n_products: None }).await.unwrap();
} }
} }

View file

@ -14,8 +14,11 @@
"format": "prettier --write ." "format": "prettier --write ."
}, },
"devDependencies": { "devDependencies": {
"@sveltejs/adapter-node": "^2.0.2",
"@sveltejs/kit": "^2.0.0", "@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0", "@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6",
"autoprefixer": "^10.4.16", "autoprefixer": "^10.4.16",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"postcss": "^8.4.32", "postcss": "^8.4.32",
@ -28,10 +31,7 @@
"tailwindcss": "^3.3.6", "tailwindcss": "^3.3.6",
"tslib": "^2.4.1", "tslib": "^2.4.1",
"typescript": "^5.0.0", "typescript": "^5.0.0",
"vite": "^5.0.3", "vite": "^5.0.3"
"@sveltejs/adapter-node": "^2.0.2",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6"
}, },
"type": "module", "type": "module",
"dependencies": { "dependencies": {
@ -39,6 +39,7 @@
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",
"drizzle-orm": "^0.29.1" "drizzle-orm": "^0.29.1",
"zod": "^3.22.4"
} }
} }

View file

@ -2,3 +2,7 @@
@tailwind base; @tailwind base;
@tailwind components; @tailwind components;
@tailwind utilities; @tailwind utilities;
:root {
color-scheme: light dark;
}

View file

@ -1,10 +1,19 @@
<script lang="ts"> <script lang="ts" context="module">
export let product: { ean: string; name: string; imageUrl?: string | null }; export type Product = { ean: string; name: string; imageUrl: string | null };
</script> </script>
<a href={`/ean/${product.ean}`} class="flex"> <script lang="ts">
export let product: Product;
</script>
<a href={`/ean/${product.ean}`} class="flex gap-2">
{#if product.imageUrl} {#if product.imageUrl}
<img src={product.imageUrl} alt={product.name} class="max-h-48" /> <img
src={product.imageUrl}
alt={product.name}
class="max-h-48"
loading="lazy"
/>
{/if} {/if}
<p class="text-xl">{product.name}</p> <p class="text-xl">{product.name}</p>
</a> </a>

View file

@ -1,64 +1,60 @@
import type { PageData, PageServerLoad } from "./$types"; import type { PageData, PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db"; import { getDb, schema } from "$lib/server/db";
const { precios } = schema; const { precios, bestSelling } = schema;
import { desc, sql } from "drizzle-orm"; import { desc, max, sql } from "drizzle-orm";
import { import {
Supermercado, Supermercado,
hostBySupermercado, hostBySupermercado,
supermercados, supermercados,
} from "db-datos/supermercado"; } from "db-datos/supermercado";
import z from "zod";
import type { Product } from "$lib/ProductPreview.svelte";
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery(); type Data = {
category: string;
products: Product[];
}[];
let cache: Promise<{ key: Date; data: Data }> = doQuery();
async function doQuery() { async function doQuery() {
const db = await getDb(); const db = await getDb();
console.time("ean");
const eans = await db const categories = await db
.select({ .select({
ean: precios.ean, fetchedAt: bestSelling.fetchedAt,
category: bestSelling.category,
eansJson: bestSelling.eansJson,
}) })
.from(precios) .from(bestSelling)
.groupBy(precios.ean) .groupBy(bestSelling.category)
.orderBy(sql`random()`) .having(max(bestSelling.fetchedAt));
.limit(50);
console.timeEnd("ean");
return; const categoriesWithProducts = await Promise.all(
categories.map(async (category) => {
const eans = z.array(z.string()).parse(JSON.parse(category.eansJson));
const precioss = await Promise.all( const products = await db
supermercados.map( .select({
async ( ean: precios.ean,
supermercado, name: precios.name,
): Promise< imageUrl: precios.imageUrl,
[ })
Supermercado, .from(precios)
{ ean: string; name: string | null; imageUrl: string | null }[], .where(sql`${precios.ean} in ${eans}`)
] .groupBy(precios.ean)
> => { .having(max(precios.fetchedAt));
const host = hostBySupermercado[supermercado];
console.time(supermercado); return {
const q = db category: category.category,
.select({ products: eans
ean: precios.ean, .map((ean) => products.find((p) => p.ean === ean))
name: precios.name, .filter((x): x is Product => !!x && !!x.name),
imageUrl: precios.imageUrl, };
}) }),
.from(precios)
.groupBy(precios.ean)
.having(sql`max(fetched_at)`)
.where(
sql`ean in ${eans.map((x) => x.ean)} and in_stock and url like ${`%${host}%`}`,
);
// console.debug(q.toSQL());
const res = await q;
console.timeEnd(supermercado);
return [supermercado, res];
},
),
); );
const data = { precios: precioss.flatMap(([_, r]) => r) };
return { key: new Date(), data }; return { key: new Date(), data: categoriesWithProducts };
} }
setInterval( setInterval(
@ -69,14 +65,8 @@ setInterval(
4 * 60 * 60 * 1000, 4 * 60 * 60 * 1000,
); );
type Precios = {
ean: string;
name: string | null;
imageUrl: string | null;
}[];
export const load: PageServerLoad = async ({ export const load: PageServerLoad = async ({
params, params,
}): Promise<{ precios: Precios }> => { }): Promise<{ data: Data }> => {
return (await cache).data; return { data: (await cache).data };
}; };

View file

@ -3,53 +3,27 @@
import type { PageData } from "./$types"; import type { PageData } from "./$types";
export let data: PageData; export let data: PageData;
$: precios = data.precios.filter(
(d): d is { ean: string; name: string; imageUrl: string | null } => const categoryLabels: { [key in string]: string } = {
!!d.name, almacen: "Almacen",
); bebidas: "Bebidas",
$: productos = precios.reduce( "frutas-y-verduras": "Frutas y Verduras",
(prev, curr) => [ };
...prev,
...(prev.find((p) => p.ean === curr.ean) ? [] : [curr]),
],
[] as { ean: string; name: string; imageUrl: string | null }[],
);
</script> </script>
<h1 class="text-xl">WIP</h1> {#each data.data as { category, products }}
<section class="my-6">
<section> <h2 class="text-2xl font-bold">
<h2 class="text-lg font-bold">Ejemplos</h2> {categoryLabels[category] ?? category}
<ul> </h2>
<li> <ul
<a href="/ean/7790070410795"> class="grid max-w-full grid-flow-col grid-rows-2 gap-x-8 gap-y-4 overflow-x-auto"
Cookies Sabor Vainilla Con Chips De Chocolate Exquisita Paq 300 Grm >
</a> {#each products as product}
</li> <li class="w-96">
<li> <ProductPreview {product} />
<a href="/ean/7794000006911"> </li>
Sopa Instantánea KNORR QUICK Zapallo Romero Sobres 5 Un. {/each}
</a> </ul>
</li> </section>
<li> {/each}
<a href="/ean/7798062540253">Agua Saborizada Levité Pera 1,5 Lts.</a>
</li>
<li>
<a href="/ean/7790895000430">Gaseosa Coca-Cola Sabor Original 1,5 Lts.</a>
</li>
<li>
<a href="/ean/7792200000128">Bizcochos Agridulc 9 De Oro Paq 200 Grm</a>
</li>
</ul>
</section>
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each productos as product}
<li>
<ProductPreview {product} />
</li>
{/each}
</ul>
</section>