Compare commits

...

6 commits

8 changed files with 125 additions and 125 deletions

2
.gitignore vendored
View file

@ -5,7 +5,7 @@ data/carrefour
p.*
p
node_modules/
*.db
*.db*
*.db-shm
*.db-wal
scraper/debug/

View file

@ -42,6 +42,9 @@ importers:
drizzle-orm:
specifier: ^0.29.1
version: 0.29.3(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)
zod:
specifier: ^3.22.4
version: 3.22.4
devDependencies:
'@sveltejs/adapter-node':
specifier: ^2.0.2
@ -2859,4 +2862,3 @@ packages:
/zod@3.22.4:
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
dev: true

View file

@ -47,7 +47,10 @@ struct ScrapUrlArgs {
url: String,
}
#[derive(clap::Args)]
struct AutoArgs {}
struct AutoArgs {
#[arg(long)]
n_products: Option<usize>,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
@ -59,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
Args::GetUrlList(a) => get_url_list_cli(a.supermercado).await,
Args::ScrapUrl(a) => scrap_url_cli(a.url).await,
Args::ScrapBestSelling => scrap_best_selling_cli().await,
Args::Auto(_) => auto_cli().await,
Args::Auto(a) => auto_cli(a).await,
Args::Cron(_) => cron_cli().await,
}
}
@ -166,7 +169,7 @@ fn build_client() -> reqwest::Client {
headers.append("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36".parse().unwrap());
reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60))
.connect_timeout(Duration::from_secs(30))
.default_headers(headers)
.build()
.unwrap()
@ -176,10 +179,19 @@ pub async fn do_request(client: &reqwest::Client, url: &str) -> reqwest::Result<
let response = client.execute(request).await?.error_for_status()?;
Ok(response)
}
/// Performs a single request for `url` and returns the response body as text.
///
/// The request goes through `do_request`, so a non-success HTTP status is
/// already reported as an error before the body is read.
///
/// # Errors
/// Returns the `reqwest` error produced either by the request itself or by
/// reading the response body.
async fn request_and_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
    do_request(client, url).await?.text().await
}
/// Downloads the body of `url` as text, retrying failed attempts.
///
/// Every attempt is one call to `request_and_body`. The policy returned by
/// `get_retry_policy` drives the retries, and `retry_if_wasnt_not_found` is
/// the predicate that decides whether a given error is retried at all.
///
/// # Errors
/// Returns the last `reqwest` error once the policy stops retrying.
pub async fn fetch_body(client: &reqwest::Client, url: &str) -> reqwest::Result<String> {
    let policy = get_retry_policy();
    // One attempt == one full request + body read.
    let attempt = || request_and_body(client, url);
    policy.retry_if(attempt, retry_if_wasnt_not_found).await
}
pub fn get_retry_policy() -> again::RetryPolicy {
RetryPolicy::exponential(Duration::from_millis(300))
.with_max_retries(10)
.with_max_retries(20)
.with_jitter(true)
}
@ -192,11 +204,7 @@ async fn fetch_and_parse(
client: &reqwest::Client,
url: String,
) -> Result<PrecioPoint, anyhow::Error> {
let body = get_retry_policy()
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
.await?
.text()
.await?;
let body = fetch_body(client, &url).await?;
let maybe_point = { scrap_url(client, url, &body).await };
@ -287,6 +295,7 @@ struct AutoTelegram {
/// State for one automatic scraping run; `auto_cli` builds it and hands a
/// clone to each per-supermarket download task.
struct Auto {
/// Database handle; `download_supermercado` uses it to load the URLs for a
/// supermarket's host.
db: Db,
/// Telegram settings. When present, `inform` also builds a request against
/// the Telegram bot `sendMessage` endpoint using the stored token.
telegram: Option<AutoTelegram>,
/// When `Some(n)`, `download_supermercado` truncates the loaded URL list to
/// its first `n` entries; `None` leaves the list untouched.
limit_n_products: Option<usize>,
}
impl Auto {
async fn download_supermercado(self, supermercado: Supermercado) -> anyhow::Result<()> {
@ -300,7 +309,13 @@ impl Auto {
))
.await;
}
let links: Vec<String> = self.db.get_urls_by_domain(supermercado.host()).await?;
let links: Vec<String> = {
let mut links = self.db.get_urls_by_domain(supermercado.host()).await?;
if let Some(n) = self.limit_n_products {
links.truncate(n);
}
links
};
// {
// let debug_path = PathBuf::from("debug/");
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
@ -340,7 +355,7 @@ impl Auto {
}
async fn inform(&self, msg: &str) {
println!("{}", msg);
tracing::info!("{}", msg);
if let Some(telegram) = &self.telegram {
let u = Url::parse_with_params(
&format!("https://api.telegram.org/bot{}/sendMessage", telegram.token),
@ -355,7 +370,7 @@ impl Auto {
}
}
async fn auto_cli() -> anyhow::Result<()> {
async fn auto_cli(args: AutoArgs) -> anyhow::Result<()> {
let auto = {
let db = Db::connect().await?;
let telegram = {
@ -370,7 +385,11 @@ async fn auto_cli() -> anyhow::Result<()> {
}
}
};
Auto { db, telegram }
Auto {
db,
telegram,
limit_n_products: args.n_products,
}
};
auto.inform("[auto] Empezando scrap").await;
let handles: Vec<_> = Supermercado::value_variants()
@ -378,6 +397,7 @@ async fn auto_cli() -> anyhow::Result<()> {
.map(|s| tokio::spawn(auto.clone().download_supermercado(s.to_owned())))
.collect();
future::try_join_all(handles).await?;
auto.inform("[auto] Download supermercados finished").await;
let best_selling = auto
.inform_time(
@ -404,7 +424,7 @@ async fn cron_cli() -> anyhow::Result<()> {
.unwrap();
println!("Waiting for {:?}", t);
tokio::time::sleep(t).await;
auto_cli().await.unwrap();
auto_cli(AutoArgs { n_products: None }).await.unwrap();
}
}

View file

@ -14,8 +14,11 @@
"format": "prettier --write ."
},
"devDependencies": {
"@sveltejs/adapter-node": "^2.0.2",
"@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6",
"autoprefixer": "^10.4.16",
"db-datos": "workspace:^",
"postcss": "^8.4.32",
@ -28,10 +31,7 @@
"tailwindcss": "^3.3.6",
"tslib": "^2.4.1",
"typescript": "^5.0.0",
"vite": "^5.0.3",
"@sveltejs/adapter-node": "^2.0.2",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6"
"vite": "^5.0.3"
},
"type": "module",
"dependencies": {
@ -39,6 +39,7 @@
"chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10",
"drizzle-orm": "^0.29.1"
"drizzle-orm": "^0.29.1",
"zod": "^3.22.4"
}
}

View file

@ -2,3 +2,7 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
/* Declare that the page can be rendered in both light and dark color schemes,
   so the browser may match its default colors to the user's preference. */
:root {
color-scheme: light dark;
}

View file

@ -1,10 +1,19 @@
<script lang="ts">
export let product: { ean: string; name: string; imageUrl?: string | null };
<script lang="ts" context="module">
export type Product = { ean: string; name: string; imageUrl: string | null };
</script>
<a href={`/ean/${product.ean}`} class="flex">
<script lang="ts">
export let product: Product;
</script>
<a href={`/ean/${product.ean}`} class="flex gap-2">
{#if product.imageUrl}
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
<img
src={product.imageUrl}
alt={product.name}
class="max-h-48"
loading="lazy"
/>
{/if}
<p class="text-xl">{product.name}</p>
</a>

View file

@ -1,64 +1,60 @@
import type { PageData, PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios } = schema;
import { desc, sql } from "drizzle-orm";
const { precios, bestSelling } = schema;
import { desc, max, sql } from "drizzle-orm";
import {
Supermercado,
hostBySupermercado,
supermercados,
} from "db-datos/supermercado";
import z from "zod";
import type { Product } from "$lib/ProductPreview.svelte";
let cache: Promise<{ key: Date; data: { precios: Precios } }> = doQuery();
/**
 * Shape of the cached page payload: a list of categories, each paired with
 * the products to display under it.
 */
type Data = {
// Category identifier as stored in the `bestSelling` rows.
category: string;
// Products resolved for this category.
products: Product[];
}[];
let cache: Promise<{ key: Date; data: Data }> = doQuery();
async function doQuery() {
const db = await getDb();
console.time("ean");
const eans = await db
const categories = await db
.select({
ean: precios.ean,
fetchedAt: bestSelling.fetchedAt,
category: bestSelling.category,
eansJson: bestSelling.eansJson,
})
.from(precios)
.groupBy(precios.ean)
.orderBy(sql`random()`)
.limit(50);
console.timeEnd("ean");
.from(bestSelling)
.groupBy(bestSelling.category)
.having(max(bestSelling.fetchedAt));
return;
const categoriesWithProducts = await Promise.all(
categories.map(async (category) => {
const eans = z.array(z.string()).parse(JSON.parse(category.eansJson));
const precioss = await Promise.all(
supermercados.map(
async (
supermercado,
): Promise<
[
Supermercado,
{ ean: string; name: string | null; imageUrl: string | null }[],
]
> => {
const host = hostBySupermercado[supermercado];
console.time(supermercado);
const q = db
const products = await db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios)
.where(sql`${precios.ean} in ${eans}`)
.groupBy(precios.ean)
.having(sql`max(fetched_at)`)
.where(
sql`ean in ${eans.map((x) => x.ean)} and in_stock and url like ${`%${host}%`}`,
.having(max(precios.fetchedAt));
return {
category: category.category,
products: eans
.map((ean) => products.find((p) => p.ean === ean))
.filter((x): x is Product => !!x && !!x.name),
};
}),
);
// console.debug(q.toSQL());
const res = await q;
console.timeEnd(supermercado);
return [supermercado, res];
},
),
);
const data = { precios: precioss.flatMap(([_, r]) => r) };
return { key: new Date(), data };
return { key: new Date(), data: categoriesWithProducts };
}
setInterval(
@ -69,14 +65,8 @@ setInterval(
4 * 60 * 60 * 1000,
);
type Precios = {
ean: string;
name: string | null;
imageUrl: string | null;
}[];
export const load: PageServerLoad = async ({
params,
}): Promise<{ precios: Precios }> => {
return (await cache).data;
}): Promise<{ data: Data }> => {
return { data: (await cache).data };
};

View file

@ -3,53 +3,27 @@
import type { PageData } from "./$types";
export let data: PageData;
$: precios = data.precios.filter(
(d): d is { ean: string; name: string; imageUrl: string | null } =>
!!d.name,
);
$: productos = precios.reduce(
(prev, curr) => [
...prev,
...(prev.find((p) => p.ean === curr.ean) ? [] : [curr]),
],
[] as { ean: string; name: string; imageUrl: string | null }[],
);
// Human-readable headings for known category slugs. The template falls back to
// the raw slug for any category that has no entry here.
const categoryLabels: { [key in string]: string } = {
almacen: "Almacen",
bebidas: "Bebidas",
"frutas-y-verduras": "Frutas y Verduras",
};
</script>
<h1 class="text-xl">WIP</h1>
<section>
<h2 class="text-lg font-bold">Ejemplos</h2>
<ul>
<li>
<a href="/ean/7790070410795">
Cookies Sabor Vainilla Con Chips De Chocolate Exquisita Paq 300 Grm
</a>
</li>
<li>
<a href="/ean/7794000006911">
Sopa Instantánea KNORR QUICK Zapallo Romero Sobres 5 Un.
</a>
</li>
<li>
<a href="/ean/7798062540253">Agua Saborizada Levité Pera 1,5 Lts.</a>
</li>
<li>
<a href="/ean/7790895000430">Gaseosa Coca-Cola Sabor Original 1,5 Lts.</a>
</li>
<li>
<a href="/ean/7792200000128">Bizcochos Agridulc 9 De Oro Paq 200 Grm</a>
</li>
</ul>
</section>
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each productos as product}
<li>
{#each data.data as { category, products }}
<section class="my-6">
<h2 class="text-2xl font-bold">
{categoryLabels[category] ?? category}
</h2>
<ul
class="grid max-w-full grid-flow-col grid-rows-2 gap-x-8 gap-y-4 overflow-x-auto"
>
{#each products as product}
<li class="w-96">
<ProductPreview {product} />
</li>
{/each}
</ul>
</section>
</section>
{/each}