Compare commits

..

No commits in common. "91c7087bdc18f01554ed9e8a1076e51fd32569e5" and "6d32c897acc535401ae54571c769009579a66131" have entirely different histories.

50 changed files with 218 additions and 1679 deletions

View file

@ -2,8 +2,6 @@ data/warcs/
data/carrefour/ data/carrefour/
*/*.db* */*.db*
sqlite.db sqlite.db
db.db
db.db-wal
downloader/ downloader/
node_modules/ node_modules/
*/node_modules/ */node_modules/

View file

@ -1,31 +0,0 @@
name: Sepa Precios Archiver
on:
schedule:
- cron: "0 */12 * * *" # Run every 12 hours
workflow_dispatch: # Allow manual trigger
jobs:
archive-prices:
runs-on: ubicloud-standard-4
steps:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
with:
bun-version: latest
# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3
# with:
# limit-access-to-actor: true
- name: Run archiver script
env:
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
B2_BUCKET_NAME: ${{ secrets.B2_BUCKET_NAME }}
B2_BUCKET_KEY_ID: ${{ secrets.B2_BUCKET_KEY_ID }}
B2_BUCKET_KEY: ${{ secrets.B2_BUCKET_KEY }}
run: |
cd sepa-precios-archiver
bun install --frozen-lockfile
bun index.ts

4
.gitignore vendored
View file

@ -3,7 +3,5 @@ node_modules/
*.db-shm *.db-shm
*.db-wal *.db-wal
target/ target/
*.local .env.*
.DS_Store

View file

@ -1 +0,0 @@
DB_PATH=../db.db

View file

@ -1,4 +1,4 @@
export const DB_PATH = process.env.DB_PATH ?? "../db.db"; export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
/** @type { import("drizzle-kit").Config } */ /** @type { import("drizzle-kit").Config } */
export default { export default {

View file

@ -1,2 +0,0 @@
-- Custom SQL migration file, put your code below! --
CREATE VIRTUAL TABLE productos_fts USING fts5 (ean, name, content = precios, content_rowid = idd);

View file

@ -1,208 +0,0 @@
{
"id": "f981b295-c9eb-4df5-88b1-d3765e4cc314",
"prevId": "c95c6547-d540-45cf-aa9d-9d828efb468e",
"version": "6",
"dialect": "sqlite",
"tables": {
"db_best_selling": {
"name": "db_best_selling",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"category": {
"name": "category",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"eans_json": {
"name": "eans_json",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
},
"precios_url_idx": {
"name": "precios_url_idx",
"columns": [
"url"
],
"isUnique": false
},
"precios_fetched_at_idx": {
"name": "precios_fetched_at_idx",
"columns": [
"fetched_at"
],
"isUnique": false
},
"precios_ean_fetched_at_idx": {
"name": "precios_ean_fetched_at_idx",
"columns": [
"ean",
"fetched_at"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
},
"internal": {
"indexes": {}
}
}

View file

@ -39,12 +39,12 @@ importers:
dayjs: dayjs:
specifier: ^1.11.10 specifier: ^1.11.10
version: 1.11.10 version: 1.11.10
drizzle-kit:
specifier: ^0.23.0
version: 0.23.0
drizzle-orm: drizzle-orm:
specifier: ^0.32.0 specifier: ^0.32.0
version: 0.32.0(@types/better-sqlite3@7.6.9)(better-sqlite3@11.1.2) version: 0.32.0(@types/better-sqlite3@7.6.9)(better-sqlite3@11.1.2)
ky:
specifier: ^1.5.0
version: 1.5.0
zod: zod:
specifier: ^3.22.4 specifier: ^3.22.4
version: 3.22.4 version: 3.22.4
@ -1227,10 +1227,6 @@ packages:
resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==} resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==}
engines: {node: '>=6'} engines: {node: '>=6'}
ky@1.5.0:
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
engines: {node: '>=18'}
lilconfig@2.1.0: lilconfig@2.1.0:
resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==} resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==}
engines: {node: '>=10'} engines: {node: '>=10'}
@ -2682,8 +2678,6 @@ snapshots:
kleur@4.1.5: {} kleur@4.1.5: {}
ky@1.5.0: {}
lilconfig@2.1.0: {} lilconfig@2.1.0: {}
lilconfig@3.1.1: {} lilconfig@3.1.1: {}

View file

@ -1 +1 @@
DATABASE_URL=sqlite://../db.db DATABASE_URL=sqlite://../sqlite.db

View file

@ -1,20 +0,0 @@
{
"db_name": "SQLite",
"query": "select count(distinct ean) as count from precios",
"describe": {
"columns": [
{
"name": "count",
"ordinal": 0,
"type_info": "Integer"
}
],
"parameters": {
"Right": 0
},
"nullable": [
false
]
},
"hash": "2e632fbda989abf0d8a88a1d3bc1de0a9aefb0d3f3cdc33d26158d09faed97b2"
}

View file

@ -1,38 +0,0 @@
{
"db_name": "SQLite",
"query": "with search_results as (\n select f.ean from precios_fts f\n where f.name match ? and f.ean != ''\n group by f.ean\n\t\t\tlimit 100\n )\n select p.id, p.ean, p.name, p.image_url from search_results as s\n join precios as p\n on p.ean = s.ean\n where p.fetched_at = (\n SELECT MAX(fetched_at)\n FROM precios as pf\n WHERE pf.ean = s.ean and pf.name is not null\n );",
"describe": {
"columns": [
{
"name": "id",
"ordinal": 0,
"type_info": "Integer"
},
{
"name": "ean",
"ordinal": 1,
"type_info": "Text"
},
{
"name": "name",
"ordinal": 2,
"type_info": "Text"
},
{
"name": "image_url",
"ordinal": 3,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false,
false,
true,
true
]
},
"hash": "3ee249afda554bbffe736257af05aba689c71188ce1a869e01988ac7ca1220a2"
}

View file

@ -6,7 +6,7 @@
{ {
"name": "count", "name": "count",
"ordinal": 0, "ordinal": 0,
"type_info": "Integer" "type_info": "Int"
} }
], ],
"parameters": { "parameters": {

View file

@ -1,56 +0,0 @@
{
"db_name": "SQLite",
"query": "\nselect ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios\nwhere ean = ?\norder by fetched_at\n",
"describe": {
"columns": [
{
"name": "ean",
"ordinal": 0,
"type_info": "Text"
},
{
"name": "fetched_at",
"ordinal": 1,
"type_info": "Integer"
},
{
"name": "precio_centavos",
"ordinal": 2,
"type_info": "Integer"
},
{
"name": "in_stock",
"ordinal": 3,
"type_info": "Integer"
},
{
"name": "url",
"ordinal": 4,
"type_info": "Text"
},
{
"name": "name",
"ordinal": 5,
"type_info": "Text"
},
{
"name": "image_url",
"ordinal": 6,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false,
false,
true,
true,
false,
true,
true
]
},
"hash": "88a597e29390fb04bbc48d9f88303551e068ddc478b037354c62bc77bc70ad96"
}

View file

@ -6,7 +6,7 @@
{ {
"name": "count", "name": "count",
"ordinal": 0, "ordinal": 0,
"type_info": "Integer" "type_info": "Int"
} }
], ],
"parameters": { "parameters": {

135
rust/Cargo.lock generated
View file

@ -35,6 +35,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"getrandom 0.2.15",
"once_cell", "once_cell",
"version_check", "version_check",
"zerocopy", "zerocopy",
@ -162,7 +163,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -353,7 +354,6 @@ dependencies = [
"iana-time-zone", "iana-time-zone",
"js-sys", "js-sys",
"num-traits", "num-traits",
"serde",
"wasm-bindgen", "wasm-bindgen",
"windows-targets 0.52.6", "windows-targets 0.52.6",
] ]
@ -386,10 +386,10 @@ version = "4.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
dependencies = [ dependencies = [
"heck", "heck 0.5.0",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -404,15 +404,6 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
[[package]]
name = "concurrent-queue"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
dependencies = [
"crossbeam-utils",
]
[[package]] [[package]]
name = "const-oid" name = "const-oid"
version = "0.9.6" version = "0.9.6"
@ -561,14 +552,9 @@ dependencies = [
[[package]] [[package]]
name = "event-listener" name = "event-listener"
version = "5.3.1" version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
@ -679,7 +665,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -762,13 +748,22 @@ dependencies = [
[[package]] [[package]]
name = "hashlink" name = "hashlink"
version = "0.9.1" version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [ dependencies = [
"hashbrown", "hashbrown",
] ]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
dependencies = [
"unicode-segmentation",
]
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.5.0" version = "0.5.0"
@ -1038,9 +1033,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]] [[package]]
name = "libsqlite3-sys" name = "libsqlite3-sys"
version = "0.28.0" version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f" checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716"
dependencies = [ dependencies = [
"cc", "cc",
"pkg-config", "pkg-config",
@ -1239,12 +1234,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.11.2" version = "0.11.2"
@ -1331,7 +1320,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -1805,7 +1794,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -1911,9 +1900,6 @@ name = "smallvec"
version = "1.13.2" version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "socket2" name = "socket2"
@ -1956,9 +1942,9 @@ dependencies = [
[[package]] [[package]]
name = "sqlx" name = "sqlx"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27144619c6e5802f1380337a209d2ac1c431002dd74c6e60aebff3c506dc4f0c" checksum = "c9a2ccff1a000a5a59cd33da541d9f2fdcd9e6e8229cc200565942bff36d0aaa"
dependencies = [ dependencies = [
"sqlx-core", "sqlx-core",
"sqlx-macros", "sqlx-macros",
@ -1969,10 +1955,11 @@ dependencies = [
[[package]] [[package]]
name = "sqlx-core" name = "sqlx-core"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a999083c1af5b5d6c071d34a708a19ba3e02106ad82ef7bbd69f5e48266b613b" checksum = "24ba59a9342a3d9bab6c56c118be528b27c9b60e490080e9711a04dccac83ef6"
dependencies = [ dependencies = [
"ahash",
"atoi", "atoi",
"byteorder", "byteorder",
"bytes", "bytes",
@ -1986,7 +1973,6 @@ dependencies = [
"futures-intrusive", "futures-intrusive",
"futures-io", "futures-io",
"futures-util", "futures-util",
"hashbrown",
"hashlink", "hashlink",
"hex", "hex",
"indexmap", "indexmap",
@ -2009,26 +1995,26 @@ dependencies = [
[[package]] [[package]]
name = "sqlx-macros" name = "sqlx-macros"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a23217eb7d86c584b8cbe0337b9eacf12ab76fe7673c513141ec42565698bb88" checksum = "4ea40e2345eb2faa9e1e5e326db8c34711317d2b5e08d0d5741619048a803127"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"sqlx-core", "sqlx-core",
"sqlx-macros-core", "sqlx-macros-core",
"syn", "syn 1.0.109",
] ]
[[package]] [[package]]
name = "sqlx-macros-core" name = "sqlx-macros-core"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a099220ae541c5db479c6424bdf1b200987934033c2584f79a0e1693601e776" checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8"
dependencies = [ dependencies = [
"dotenvy", "dotenvy",
"either", "either",
"heck", "heck 0.4.1",
"hex", "hex",
"once_cell", "once_cell",
"proc-macro2", "proc-macro2",
@ -2040,7 +2026,7 @@ dependencies = [
"sqlx-mysql", "sqlx-mysql",
"sqlx-postgres", "sqlx-postgres",
"sqlx-sqlite", "sqlx-sqlite",
"syn", "syn 1.0.109",
"tempfile", "tempfile",
"tokio", "tokio",
"url", "url",
@ -2048,12 +2034,12 @@ dependencies = [
[[package]] [[package]]
name = "sqlx-mysql" name = "sqlx-mysql"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5afe4c38a9b417b6a9a5eeffe7235d0a106716495536e7727d1c7f4b1ff3eba6" checksum = "1ed31390216d20e538e447a7a9b959e06ed9fc51c37b514b46eb758016ecd418"
dependencies = [ dependencies = [
"atoi", "atoi",
"base64 0.22.1", "base64 0.21.7",
"bitflags 2.6.0", "bitflags 2.6.0",
"byteorder", "byteorder",
"bytes", "bytes",
@ -2091,12 +2077,12 @@ dependencies = [
[[package]] [[package]]
name = "sqlx-postgres" name = "sqlx-postgres"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1dbb157e65f10dbe01f729339c06d239120221c9ad9fa0ba8408c4cc18ecf21" checksum = "7c824eb80b894f926f89a0b9da0c7f435d27cdd35b8c655b114e58223918577e"
dependencies = [ dependencies = [
"atoi", "atoi",
"base64 0.22.1", "base64 0.21.7",
"bitflags 2.6.0", "bitflags 2.6.0",
"byteorder", "byteorder",
"chrono", "chrono",
@ -2130,9 +2116,9 @@ dependencies = [
[[package]] [[package]]
name = "sqlx-sqlite" name = "sqlx-sqlite"
version = "0.8.0" version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b2cdd83c008a622d94499c0006d8ee5f821f36c89b7d625c900e5dc30b5c5ee" checksum = "b244ef0a8414da0bed4bb1910426e890b19e5e9bccc27ada6b797d05c55ae0aa"
dependencies = [ dependencies = [
"atoi", "atoi",
"chrono", "chrono",
@ -2146,10 +2132,10 @@ dependencies = [
"log", "log",
"percent-encoding", "percent-encoding",
"serde", "serde",
"serde_urlencoded",
"sqlx-core", "sqlx-core",
"tracing", "tracing",
"url", "url",
"urlencoding",
] ]
[[package]] [[package]]
@ -2175,6 +2161,17 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.71" version = "2.0.71"
@ -2227,7 +2224,7 @@ checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -2288,7 +2285,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -2386,7 +2383,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]
@ -2467,6 +2464,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291"
[[package]]
name = "unicode-segmentation"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
[[package]] [[package]]
name = "unicode_categories" name = "unicode_categories"
version = "0.1.1" version = "0.1.1"
@ -2490,6 +2493,12 @@ dependencies = [
"percent-encoding", "percent-encoding",
] ]
[[package]]
name = "urlencoding"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]] [[package]]
name = "utf8-width" name = "utf8-width"
version = "0.1.7" version = "0.1.7"
@ -2568,7 +2577,7 @@ dependencies = [
"once_cell", "once_cell",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@ -2602,7 +2611,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
"wasm-bindgen-backend", "wasm-bindgen-backend",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@ -2854,7 +2863,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn", "syn 2.0.71",
] ]
[[package]] [[package]]

View file

@ -9,10 +9,10 @@ edition = "2021"
again = "0.1.2" again = "0.1.2"
anyhow = "1.0.79" anyhow = "1.0.79"
base64 = "0.21.7" base64 = "0.21.7"
chrono = { version = "0.4", features = ["serde"] } chrono = "0.4"
clap = { version = "4.4.15", features = ["derive"] } clap = { version = "4.4.15", features = ["derive"] }
cron = "0.12.0" cron = "0.12.0"
sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite", "chrono", "json" ] } sqlx = { version = "0.7", features = [ "runtime-tokio", "sqlite", "chrono" ] }
futures = "0.3.30" futures = "0.3.30"
html-escape = "0.2.13" html-escape = "0.2.13"
itertools = "0.12.0" itertools = "0.12.0"

View file

@ -1,16 +1,8 @@
use axum::{ use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
extract::{Path, State},
http::StatusCode,
response::IntoResponse,
routing::get,
Json, Router,
};
use chrono::{DateTime, Utc};
use clap::ValueEnum; use clap::ValueEnum;
use futures::future::join_all; use futures::future::join_all;
use itertools::Itertools; use itertools::Itertools;
use preciazo::supermercado::Supermercado; use preciazo::supermercado::Supermercado;
use serde::Serialize;
use sqlx::{ use sqlx::{
sqlite::{SqliteConnectOptions, SqlitePoolOptions}, sqlite::{SqliteConnectOptions, SqlitePoolOptions},
SqlitePool, SqlitePool,
@ -102,220 +94,31 @@ async fn healthcheck(State(pool): State<SqlitePool>) -> impl IntoResponse {
} }
} }
#[derive(Serialize)]
struct CategoryWithProducts {
category: String,
products: Vec<Product>,
}
#[derive(Serialize)]
struct Product {
ean: String,
name: Option<String>,
image_url: Option<String>,
}
async fn get_best_selling(State(pool): State<SqlitePool>) -> impl IntoResponse {
#[derive(sqlx::FromRow, Debug)]
struct ProductWithCategory {
category: String,
ean: String,
name: Option<String>,
image_url: Option<String>,
}
let products_with_category = sqlx::query_as::<_, ProductWithCategory>(
"with latest_best_selling as (
select category, eans_json
from db_best_selling
group by category
having max(fetched_at)
),
extracted_eans as (
select latest_best_selling.category, json.value as ean
from latest_best_selling, json_each(latest_best_selling.eans_json) json
)
select extracted_eans.category, extracted_eans.ean, precios.image_url, name
from extracted_eans
join precios
on extracted_eans.ean = precios.ean
where
precios.fetched_at = (
SELECT MAX(fetched_at)
FROM precios
WHERE ean = extracted_eans.ean
)",
)
.fetch_all(&pool)
.await
.unwrap();
let categories = products_with_category
.iter()
.map(|p| p.category.clone())
.unique()
.collect_vec();
let categories_with_products = categories
.into_iter()
.map(|c| CategoryWithProducts {
category: c.clone(),
products: products_with_category
.iter()
.filter(|p| p.category == c)
.map(|p| Product {
ean: p.ean.clone(),
image_url: p.image_url.clone(),
name: p.name.clone(),
})
.collect_vec(),
})
.collect_vec();
Json(categories_with_products)
}
async fn get_product_history(
State(pool): State<SqlitePool>,
Path(ean): Path<String>,
) -> impl IntoResponse {
#[derive(sqlx::FromRow, Debug, Serialize)]
struct Precio {
ean: String,
fetched_at: chrono::DateTime<Utc>,
precio_centavos: Option<i64>,
in_stock: Option<bool>,
url: String,
name: Option<String>,
image_url: Option<String>,
}
let precios = sqlx::query!(
"
select ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios
where ean = ?
order by fetched_at
",
ean
)
.map(|r| Precio {
ean: r.ean,
url: r.url,
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
image_url: r.image_url,
name: r.name,
in_stock: r.in_stock.map(|x| x == 1),
precio_centavos: r.precio_centavos,
})
.fetch_all(&pool)
.await
.unwrap();
Json(precios)
}
async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> impl IntoResponse {
let sql_query = query
.clone()
.replace("\"", "\"\"")
.split(" ")
.map(|x| format!("\"{}\"", x))
.join(" ");
#[derive(Serialize)]
struct Result {
ean: String,
name: String,
image_url: String,
}
let results = sqlx::query!(
"with search_results as (
select f.ean from precios_fts f
where f.name match ? and f.ean != ''
group by f.ean
limit 100
)
select p.id, p.ean, p.name, p.image_url from search_results as s
join precios as p
on p.ean = s.ean
where p.fetched_at = (
SELECT MAX(fetched_at)
FROM precios as pf
WHERE pf.ean = s.ean and pf.name is not null
);",
sql_query
)
.fetch_all(&pool)
.await
.unwrap()
.into_iter()
.map(|r| Result {
ean: r.ean,
image_url: r.image_url.unwrap(),
name: r.name.unwrap(),
})
.collect_vec();
Json(results)
}
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
#[derive(Serialize)]
struct Info {
count: i64,
}
let count = sqlx::query!("select count(distinct ean) as count from precios")
.fetch_one(&pool)
.await
.unwrap()
.count;
Json(Info { count })
}
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
tracing_subscriber::fmt::init(); tracing_subscriber::fmt::init();
let pool = SqlitePoolOptions::new() let pool = SqlitePoolOptions::new()
.max_connections(10) .max_connections(1)
.connect_with( .connect_with(
SqliteConnectOptions::from_str(&format!( SqliteConnectOptions::from_str(&format!(
"sqlite://{}", "sqlite://{}",
env::var("DB_PATH").unwrap_or("../db.db".to_string()) env::var("DB_PATH").unwrap_or("../sqlite.db".to_string())
)) ))
.unwrap() .unwrap()
.journal_mode(sqlx::sqlite::SqliteJournalMode::Wal) .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)
.synchronous(sqlx::sqlite::SqliteSynchronous::Normal) .busy_timeout(Duration::from_secs(15))
.busy_timeout(Duration::from_secs(30))
.optimize_on_close(true, None), .optimize_on_close(true, None),
) )
.await .await
.expect("can't connect to database"); .expect("can't connect to database");
sqlx::query("pragma temp_store = memory;")
.execute(&pool)
.await
.unwrap();
sqlx::query("pragma mmap_size = 30000000000;")
.execute(&pool)
.await
.unwrap();
sqlx::query("pragma page_size = 4096;")
.execute(&pool)
.await
.unwrap();
let app = Router::new() let app = Router::new()
.route("/", get(index)) .route("/", get(index))
.route("/api/healthcheck", get(healthcheck)) .route("/api/healthcheck", get(healthcheck))
.route("/api/0/best-selling-products", get(get_best_selling))
.route("/api/0/ean/:ean/history", get(get_product_history))
.route("/api/0/info", get(get_info))
.route("/api/0/search/:query", get(search))
.with_state(pool); .with_state(pool);
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap(); let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
tracing::info!("listening on {}", listener.local_addr().unwrap()); tracing::debug!("listening on {}", listener.local_addr().unwrap());
axum::serve(listener, app).await.unwrap(); axum::serve(listener, app).await.unwrap();
} }

View file

@ -1,5 +1,3 @@
use std::env;
use super::now_sec; use super::now_sec;
use super::AutoArgs; use super::AutoArgs;
use super::AutoTelegram; use super::AutoTelegram;
@ -66,16 +64,7 @@ impl Auto {
// } // }
{ {
let t0 = now_sec(); let t0 = now_sec();
let counters = self.scraper.fetch_list(&self.db, links).await;
let n_coroutines = if supermercado == Supermercado::Coto {
50
} else {
env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número")
};
let counters = self.scraper.fetch_list(&self.db, links, n_coroutines).await;
self.inform(&format!( self.inform(&format!(
"Downloaded {:?}: {:?} (took {})", "Downloaded {:?}: {:?} (took {})",
&supermercado, &supermercado,

View file

@ -17,7 +17,7 @@ pub struct Db {
impl Db { impl Db {
pub async fn connect() -> anyhow::Result<Self> { pub async fn connect() -> anyhow::Result<Self> {
let db_path = env::var("DB_PATH").unwrap_or("../db.db".to_string()); let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
info!("Opening DB at {}", db_path); info!("Opening DB at {}", db_path);
let read_pool = connect_to_db(&db_path, 32).await?; let read_pool = connect_to_db(&db_path, 32).await?;
let write_pool = connect_to_db(&db_path, 1).await?; let write_pool = connect_to_db(&db_path, 1).await?;

View file

@ -99,7 +99,7 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
let db = Db::connect().await?; let db = Db::connect().await?;
let scraper = Scraper::from_env().await?; let scraper = Scraper::from_env().await?;
let counters = scraper.fetch_list(&db, links, 100).await; let counters = scraper.fetch_list(&db, links).await;
println!("Finished: {:?}", counters); println!("Finished: {:?}", counters);
Ok(()) Ok(())

View file

@ -128,7 +128,11 @@ impl Scraper {
counters counters
} }
pub async fn fetch_list(&self, db: &Db, links: Vec<String>, n_coroutines: usize) -> Counters { pub async fn fetch_list(&self, db: &Db, links: Vec<String>) -> Counters {
let n_coroutines = env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número");
stream::iter(links) stream::iter(links)
.map(|url| { .map(|url| {
let db = db.clone(); let db = db.clone();

View file

@ -56,11 +56,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
.find_map(|n| n.as_tag()) .find_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser())) .map(|t| t.inner_text(dom.parser()))
// https://github.com/catdevnull/preciazo/issues/24 // https://github.com/catdevnull/preciazo/issues/24
.map(|s| { .map(|s| html_escape::decode_html_entities(s.trim()).to_string());
html_escape::decode_html_entities(s.trim())
.trim()
.to_string()
});
let image_url = dom let image_url = dom
.query_selector(".zoomImage1") .query_selector(".zoomImage1")

View file

@ -207,15 +207,14 @@ pub async fn get_best_selling_by_category(
.append_pair("extensions", &{ .append_pair("extensions", &{
let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets": let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets":
query.split('/').map(|f| json!({"key":"c","value":f})).collect::<Vec<_>>() query.split('/').map(|f| json!({"key":"c","value":f})).collect::<Vec<_>>()
,"facetsBehavior":"Static","categoryTreeBehavior":"default", ,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false,"showSponsored":false});
"withFacets":false,"showSponsored":false,"advertisementOptions":{"showSponsored":false,"sponsoredCount":0,"advertisementPlacement":"top_search","repeatSponsoredProducts":true}});
let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string()); let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string());
format!( format!(
r#"{{ r#"{{
"persistedQuery": {{ "persistedQuery": {{
"version": 1, "version": 1,
"sha256Hash": "8e3fd5f65d7d83516bfea23051b11e7aa469d85f26906f27e18afbee52c56ce4", "sha256Hash": "fd92698fe375e8e4fa55d26fa62951d979b790fcf1032a6f02926081d199f550",
"sender": "vtex.store-resources@0.x", "sender": "vtex.store-resources@0.x",
"provider": "vtex.search-graphql@0.x" "provider": "vtex.search-graphql@0.x"
}}, }},

View file

@ -1,175 +0,0 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Caches
.cache
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# IntelliJ based IDEs
.idea
# Finder (MacOS) folder config
.DS_Store

View file

@ -1,19 +0,0 @@
# sepa-precios-archiver
Archivador del dataset de precios de [Precios Claros - Base SEPA](https://datos.produccion.gob.ar/dataset/sepa-precios). Recomprime para utilizar ~8 veces menos espacio, y resube a un bucket mío de Backblaze B2.
## Instalación
Para instalar las dependencias:
```bash
bun install
```
Para ejecutarlo:
```bash
bun run index.ts
```
This project was created using `bun init` in bun v1.1.25. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.

Binary file not shown.

View file

@ -1,190 +0,0 @@
import { z } from "zod";
import { zDatasetInfo } from "./schemas";
import { mkdtemp, writeFile, readdir, mkdir, rm } from "fs/promises";
import { basename, extname, join } from "path";
import { $, write } from "bun";
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
/**
 * Reads a required environment variable and logs whether it is present.
 *
 * @param variableName Name of the variable to look up in `process.env`.
 * @returns The variable's value when it is set to a non-empty string.
 * Terminates the process with exit code 1 when the variable is unset or empty.
 */
function checkEnvVariable(variableName: string) {
  const value = process.env[variableName];
  // Guard clause: a missing (or empty) variable is fatal for the script.
  if (!value) {
    console.log(`${variableName} is not set`);
    process.exit(1);
  }
  console.log(`${variableName} is set`);
  return value;
}
// Required configuration. checkEnvVariable exits the process with code 1 if
// any of these variables is missing, so everything below can assume they are set.
const GITHUB_TOKEN = checkEnvVariable("GITHUB_TOKEN");
const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");
// S3 client pointed at the Backblaze B2 endpoint for region us-west-004,
// authenticated with the bucket key pair read above.
const s3 = new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: B2_BUCKET_KEY_ID,
secretAccessKey: B2_BUCKET_KEY,
},
});
/**
 * Downloads the `package_show` response for the `sepa-precios` dataset.
 *
 * @returns The parsed JSON body, unvalidated (callers validate it separately).
 */
async function getRawDatasetInfo() {
  const packageShowUrl =
    "https://datos.produccion.gob.ar/api/3/action/package_show?id=sepa-precios";
  const response = await fetchWithRetry(packageShowUrl);
  return await response.json();
}
/**
 * Writes the dataset metadata as `dataset-info.json` into a fresh clone of the
 * `sepa-precios-metadata` repository, commits it if it changed, and pushes to
 * `main`. The temporary clone is removed even when a step fails.
 *
 * @param datasetInfo Any JSON-serializable value; stored pretty-printed.
 */
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
  const checkout = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");
  try {
    await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${checkout}`;

    const serialized = JSON.stringify(datasetInfo, null, 2);
    await writeFile(`${checkout}/dataset-info.json`, serialized);

    await $`cd ${checkout} && git add dataset-info.json`;
    await $`cd ${checkout} && git config user.email "git@nulo.in" && git config user.name "github actions"`;
    // `git diff --staged --quiet` exits 0 when nothing is staged, which skips the commit.
    await $`cd ${checkout} && git diff --staged --quiet || git commit -m "Update dataset info"`;
    await $`cd ${checkout} && git push origin main`;
  } finally {
    await $`rm -rf ${checkout}`;
  }
  console.log(`✅ Saved dataset info into repo`);
}
/**
 * Checks whether an object with the given key already exists in the bucket.
 *
 * @param fileName Object key to look up in `B2_BUCKET_NAME`.
 * @returns `true` when the HEAD request succeeds, `false` when it fails with
 * an error whose `name` is `"NotFound"`.
 * @throws Any other error raised by the request is rethrown unchanged.
 */
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
  try {
    const head = new HeadObjectCommand({
      Bucket: B2_BUCKET_NAME,
      Key: fileName,
    });
    await s3.send(head);
    return true;
  } catch (error) {
    const isMissing = (error as any).name === "NotFound";
    if (!isMissing) throw error;
    return false;
  }
}
/**
 * Uploads `fileContent` to the bucket under the key `fileName` and waits for
 * the upload to finish.
 *
 * @param fileName Object key to write in `B2_BUCKET_NAME`.
 * @param fileContent Body to upload: a stream, a blob, or a string.
 */
async function uploadToB2Bucket(
  fileName: string,
  fileContent: ReadableStream | Blob | string,
) {
  const params = {
    Bucket: B2_BUCKET_NAME,
    Key: fileName,
    Body: fileContent,
  };
  await new Upload({ client: s3, params }).done();
}
// Fetch the dataset metadata once and record it in the metadata git repository.
const rawDatasetInfo = await getRawDatasetInfo();
await saveDatasetInfoIntoRepo(rawDatasetInfo);
// Set to true by checkRes when a download response is not OK; checked at the
// end of the script to exit with code 1.
let errored = false;
/**
 * Fetches `url`, retrying on network errors, timeouts and non-OK statuses.
 *
 * @param url Address to request.
 * @param maxRetries Total number of attempts before giving up.
 * @param waitTime Per-attempt abort timeout in milliseconds, passed to
 * `AbortSignal.timeout`.
 * @returns The first response whose `ok` flag is true.
 * @throws The error from the last failed attempt, or `Error("Max retries reached")`
 * when no attempt is made at all (`maxRetries <= 0`).
 */
async function fetchWithRetry(
  url: string,
  maxRetries = 3,
  waitTime = 15000,
): Promise<Response> {
  for (let failures = 0; failures < maxRetries; ) {
    try {
      const response = await fetch(url, {
        signal: AbortSignal.timeout(waitTime),
      });
      if (response.ok) return response;
      throw new Error(`HTTP error! status: ${response.status}`);
    } catch (error) {
      failures += 1;
      console.error(`Attempt ${failures} failed: ${error}`);
      if (failures >= maxRetries) throw error;
      // Linear backoff: 1s after the first failure, 2s after the second, ...
      const backoffMs = 1000 * failures;
      await new Promise((resolve) => setTimeout(resolve, backoffMs));
    }
  }
  throw new Error("Max retries reached");
}
/**
 * Type guard for a download response that can be streamed onward.
 *
 * @param res Response to inspect.
 * @returns `true` when the response is OK and has a body; `false` when it is
 * not OK, in which case the failure is logged and the module-level `errored`
 * flag is set.
 * @throws Error when the response is OK but has no body.
 */
function checkRes(
  res: Response,
): res is Response & { body: ReadableStream<Uint8Array> } {
  if (res.ok) {
    if (!res.body) throw new Error(`❌ No body in response`);
    return true;
  }
  console.error(`❌ Error downloading ${res.url}`);
  errored = true;
  return false;
}
// Keep a timestamped copy of the raw metadata in the bucket on every run.
await uploadToB2Bucket(
`timestamped-metadata/${new Date().toISOString()}.json`,
JSON.stringify(rawDatasetInfo, null, 2),
);
// Validate the metadata shape before iterating over its resources; parse throws on mismatch.
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
for (const resource of datasetInfo.result.resources) {
if (extname(resource.url) === ".zip") {
// Zip resources are unpacked and re-compressed as a single .tar.zst archive.
const fileName = `${resource.id}-${basename(resource.url)}-repackaged.tar.zst`;
// Skip resources that were already archived under this key.
if (await checkFileExistsInB2(fileName)) continue;
console.log(`⬇️ Downloading, repackaging and uploading ${resource.url}`);
const dir = await mkdtemp("/tmp/sepa-precios-archiver-repackage-");
console.info(dir);
try {
const zip = join(dir, "zip");
await $`curl --retry 8 --retry-delay 5 --retry-all-errors -L -o ${zip} ${resource.url}`;
await $`unzip ${zip} -d ${dir}`;
await rm(zip);
// The outer zip may contain nested zips: extract each one into a directory
// named after it, then delete the nested zip so it is not archived as well.
for (const file of await readdir(dir)) {
const path = join(dir, file);
if (extname(file) !== ".zip") continue;
const extractDir = join(dir, basename(file, ".zip"));
await mkdir(extractDir, { recursive: true });
await $`cd ${dir} && unzip ${path} -d ${extractDir}`;
await rm(path);
}
// Bundle the metadata snapshot alongside the extracted data.
await writeFile(
join(dir, "dataset-info.json"),
JSON.stringify(rawDatasetInfo, null, 2),
);
// The whole compressed archive is collected into a Blob before uploading.
const compressed =
await $`tar -c -C ${dir} . | zstd -15 --long -T0`.blob();
await uploadToB2Bucket(fileName, compressed);
} finally {
// Always remove the working directory, even if a step above failed.
await $`rm -rf ${dir}`;
}
} else {
// Non-zip resources are re-uploaded as-is, streaming the response body.
const fileName = `${resource.id}-${basename(resource.url)}`;
if (await checkFileExistsInB2(fileName)) continue;
console.log(`⬇️ Downloading and reuploading ${resource.url}`);
const response = await fetchWithRetry(resource.url, 3, 60 * 1000);
if (!checkRes(response)) continue;
await uploadToB2Bucket(fileName, response.body);
}
}
// Report failure to the caller if checkRes flagged any download as failed.
if (errored) {
process.exit(1);
}

View file

@ -1,16 +0,0 @@
{
"name": "sepa-precios-archiver",
"module": "index.ts",
"type": "module",
"devDependencies": {
"@types/bun": "latest"
},
"peerDependencies": {
"typescript": "^5.5.4"
},
"dependencies": {
"@aws-sdk/client-s3": "^3.637.0",
"@aws-sdk/lib-storage": "^3.637.0",
"zod": "^3.23.8"
}
}

View file

@ -1,17 +0,0 @@
import { z } from "zod";
/** Shape of a single entry in the dataset's `resources` list. */
const zResource = z.object({
  id: z.string(),
  size: z.number(),
  format: z.string(),
  created: z.coerce.date(),
  url: z.string(),
  modified: z.coerce.date().optional(),
  description: z.string(),
});

/**
 * Schema for the part of the dataset metadata this project reads: the two
 * metadata timestamps (coerced to `Date`) and the list of resources.
 */
export const zDatasetInfo = z.object({
  metadata_modified: z.coerce.date(),
  metadata_created: z.coerce.date(),
  resources: z.array(zResource),
});

View file

@ -1,27 +0,0 @@
{
"compilerOptions": {
// Enable latest features
"lib": ["ESNext", "DOM"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,
// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,
// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}

View file

@ -1,175 +0,0 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Caches
.cache
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# IntelliJ based IDEs
.idea
# Finder (MacOS) folder config
.DS_Store

View file

@ -1,14 +0,0 @@
# sepa-precios-importer
Importador de [datasets de precios de SEPA](https://datos.produccion.gob.ar/dataset/sepa-precios/archivo/d076720f-a7f0-4af8-b1d6-1b99d5a90c14) a una base de datos PostgreSQL.
Véase [Errores en el formato de los datos SEPA](https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b)
To install dependencies:
```bash
bun install
bun run index.ts ~/carpeta-con-datasets-descomprimidos
```
This project was created using `bun init` in bun v1.1.26. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.

Binary file not shown.

View file

@ -1,231 +0,0 @@
import { readFile } from "fs/promises";
import Papa from "papaparse";
import { basename, join, dirname } from "path";
import postgres from "postgres";
import { Readable } from "stream";
import { pipeline } from "node:stream/promises";
import { Glob } from "bun";
// Client for the `sepa-precios` database. No other connection options are
// passed here (presumably they come from postgres.js defaults — confirm).
const sql = postgres({
database: "sepa-precios",
});
// Uncomment to start from scratch:
// await sql`
// drop table if exists precios;`;
// await sql`
// drop table if exists datasets;`;
// One row per imported dataset directory; `name` is unique, so inserting the
// same directory name twice fails with a unique violation.
await sql`
CREATE TABLE if not exists datasets (
id SERIAL PRIMARY KEY,
name TEXT UNIQUE,
date DATE
);`;
// Branches ("sucursales") of each dataset, unique per
// (id_dataset, id_comercio, id_bandera, id_sucursal).
await sql`
CREATE TABLE if not exists sucursales (
id_dataset INTEGER REFERENCES datasets(id),
id_comercio INTEGER,
id_bandera INTEGER,
id_sucursal INTEGER,
sucursales_nombre TEXT,
sucursales_tipo TEXT,
sucursales_calle TEXT,
sucursales_numero TEXT,
sucursales_latitud NUMERIC,
sucursales_longitud NUMERIC,
sucursales_observaciones TEXT,
sucursales_barrio TEXT,
sucursales_codigo_postal TEXT,
sucursales_localidad TEXT,
sucursales_provincia TEXT,
sucursales_lunes_horario_atencion TEXT,
sucursales_martes_horario_atencion TEXT,
sucursales_miercoles_horario_atencion TEXT,
sucursales_jueves_horario_atencion TEXT,
sucursales_viernes_horario_atencion TEXT,
sucursales_sabado_horario_atencion TEXT,
sucursales_domingo_horario_atencion TEXT,
UNIQUE (id_dataset, id_comercio, id_bandera, id_sucursal)
);`;
// Price rows; each one must reference an existing row in `sucursales` through
// the composite foreign key, so branches have to be imported before prices.
await sql`
CREATE TABLE if not exists precios (
id_dataset INTEGER REFERENCES datasets(id),
id_comercio INTEGER,
id_bandera INTEGER,
id_sucursal INTEGER,
id_producto BIGINT,
productos_ean INTEGER,
productos_descripcion TEXT,
productos_cantidad_presentacion NUMERIC(10, 2),
productos_unidad_medida_presentacion TEXT,
productos_marca TEXT,
productos_precio_lista NUMERIC(10, 2),
productos_precio_referencia NUMERIC(10, 2),
productos_cantidad_referencia NUMERIC(10, 2),
productos_unidad_medida_referencia TEXT,
productos_precio_unitario_promo1 NUMERIC(10, 2),
productos_leyenda_promo1 TEXT,
productos_precio_unitario_promo2 NUMERIC(10, 2),
productos_leyenda_promo2 TEXT,
FOREIGN KEY (id_dataset, id_comercio, id_bandera, id_sucursal) REFERENCES sucursales(id_dataset, id_comercio, id_bandera, id_sucursal)
);
`;
/**
 * Loads `sucursales.csv` from `dir` and bulk-copies its rows into the
 * `sucursales` table, tagging every row with `datasetId`.
 *
 * Rows missing `id_comercio`, `id_bandera` or `id_sucursal` are dropped.
 * The column list for the COPY is taken from the keys of the first row.
 *
 * @param sql Connection (or transaction) used to run the COPY.
 * @param datasetId Id of the `datasets` row these branches belong to.
 * @param dir Directory that contains `sucursales.csv`.
 * @throws Error when the CSV has no row with all three ids set, since there
 * would be no first row to derive the column list from.
 */
async function importSucursales(
  sql: postgres.Sql,
  datasetId: number,
  dir: string,
) {
  const csvPath = join(dir, "sucursales.csv");
  const sucursales: Papa.ParseResult<any> = Papa.parse(
    await readFile(csvPath, "utf-8"),
    {
      header: true,
    },
  );

  const objs = sucursales.data
    .filter((data) => data.id_comercio && data.id_bandera && data.id_sucursal)
    .map((data) => {
      // Megatone: their header is missing an underscore in this column name.
      if ("sucursales_domingohorario_atencion" in data) {
        data.sucursales_domingo_horario_atencion =
          data.sucursales_domingohorario_atencion;
        delete data.sucursales_domingohorario_atencion;
      }
      return {
        id_dataset: datasetId,
        ...data,
      };
    });

  // Fail with a clear message instead of the TypeError that
  // Object.keys(undefined) would raise on an empty row list.
  if (objs.length === 0) {
    throw new Error(`No hay sucursales válidas en ${csvPath}`);
  }

  const keys = Object.keys(objs[0]);
  // One tab-separated line per row, with values in the same order as `keys`.
  const lines = Readable.from(
    objs.map((data) => keys.map((key) => (data as any)[key]).join("\t") + "\n"),
  );
  const writable =
    await sql`copy sucursales (${sql.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
  await pipeline(lines, writable);
}
/**
 * Imports one dataset directory (containing comercio.csv, sucursales.csv and
 * productos.csv) inside a single `sql.begin` transaction callback.
 *
 * A new `datasets` row is inserted first, named after the directory. If the
 * import fails with error code 23505, the dataset is reported as already
 * existing and skipped; any other error is rethrown.
 *
 * @param dir Dataset directory; its basename must contain a YYYY-MM-DD date.
 */
async function importDataset(dir: string) {
// The non-null assertion means a directory name without a date throws here.
const date = basename(dir).match(/(\d{4}-\d{2}-\d{2})/)![1];
// TODO: parse "Ultima actualizacion" at the end of the CSV and insert it into the datasets table
// {
// const res =
// await sql`select id from datasets where name = ${basename(dir)}`;
// await importSucursales(sql, res[0].id, dir);
// }
try {
await sql.begin(async (sql) => {
let datasetId: number;
const res =
await sql`insert into datasets (name, date) values (${basename(dir)}, ${date}) returning id`;
datasetId = res[0].id;
const datas: any[] = [];
const comercios: Papa.ParseResult<{ comercio_cuit: string }> = Papa.parse(
await readFile(join(dir, "comercio.csv"), "utf-8"),
{ header: true },
);
// Only the first row of comercio.csv is used to identify the provider.
const comercioCuit = comercios.data[0].comercio_cuit;
console.log(`dataset ${datasetId}, comercio ${comercioCuit}`);
// Branches go in first: `precios` rows reference them through a foreign key.
await importSucursales(sql, datasetId, dir);
let file = await readFile(join(dir, "productos.csv"), "utf-8");
// WALL OF SHAME: these providers don't know how to produce correct CSVs
if (comercioCuit == "30612929455") {
// Libertad S.A.
file = file.replaceAll("|RAPTOR 6X16X45", "/RAPTOR 6X16X45");
} else if (comercioCuit == "30578411174") {
// Alberdi S.A.
file = file.replaceAll(";", "|");
}
if (
["33504047089", "30707429468", "30589621499"].includes(comercioCuit)
) {
// TODO: they do have the values, but under other names; for example
// productos_precio_lista would be precio_unitario_bulto_por_unidad_venta_con_iva.
// But I don't want to lie, so for now I'm not importing it.
console.error(
`No voy a importar el dataset ${dir} porque el formato está mal. Pero se podría importar. Pero por ahora no lo voy a hacer. Véase https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b`,
);
// Returns from the transaction callback without importing any prices; the
// datasets and sucursales rows written above are not undone here.
// NOTE(review): presumably the transaction still commits — confirm.
return;
}
console.time("parse");
// Wrap Papa's callback API in a promise so the transaction callback can await it.
return await new Promise((resolve, reject) => {
Papa.parse(file, {
header: true,
step: function (result: any) {
const { data } = result;
// Keep only rows that carry all four identifying ids.
if (
data.id_comercio &&
data.id_bandera &&
data.id_sucursal &&
data.id_producto
)
datas.push(data);
},
complete: async function () {
try {
console.timeEnd("parse");
console.time("map");
const objs = datas.map((data) => {
// `precios` has no id_dun_14 column, so drop the field before deriving the column list.
delete data.id_dun_14;
return {
id_dataset: datasetId,
...data,
// Tabs are the COPY delimiter, so they cannot appear inside a value.
productos_descripcion: data.productos_descripcion.replaceAll(
"\t",
" ",
),
};
});
if (!objs.length) {
console.error(`No hay datos para el dataset ${dir}`);
return;
}
// Column list for the COPY comes from the keys of the first row.
const keys = Object.keys(objs[0]);
const lines = Readable.from(
objs.map(
(data) => keys.map((key) => data[key]).join("\t") + "\n",
),
);
console.timeEnd("map");
console.time("copy");
const writable =
await sql`copy precios (${sql.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
await pipeline(lines, writable);
console.timeEnd("copy");
console.info(`saved ${objs.length} rows`);
} catch (e) {
reject(e);
return;
} finally {
// Runs on every path. After reject() the resolve() below has no effect,
// because the promise is already settled.
Bun.gc(true);
resolve(void 0);
}
},
skipEmptyLines: true,
});
});
});
} catch (e) {
// 23505 is PostgreSQL's unique_violation; here it is treated as "this
// dataset name was already imported".
if ((e as any).code == "23505") {
console.log(`dataset ${basename(dir)} already exists`);
return;
}
throw e;
}
}
// Entry point: import every directory under the root given as the first CLI
// argument that contains a productos.csv, one at a time, then close the
// database connection whether or not an import failed.
try {
  const root = process.argv[2];
  const productosGlob = new Glob("**/productos.csv");
  for await (const match of productosGlob.scan(root)) {
    const datasetDir = join(root, dirname(match));
    console.log(datasetDir);
    await importDataset(datasetDir);
  }
} finally {
  await sql.end();
}

View file

@ -1,17 +0,0 @@
{
"name": "sepa-precios-importer",
"module": "index.ts",
"type": "module",
"devDependencies": {
"@types/bun": "^1.1.7",
"@types/papaparse": "^5.3.14"
},
"peerDependencies": {
"typescript": "^5.0.0"
},
"dependencies": {
"p-queue": "^8.0.1",
"papaparse": "^5.4.1",
"postgres": "^3.4.4"
}
}

View file

@ -1,27 +0,0 @@
{
"compilerOptions": {
// Enable latest features
"lib": ["ESNext", "DOM"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,
// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,
// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}

View file

@ -1,2 +0,0 @@
DB_PATH=../db.db
VITE_API_HOST=http://localhost:8000

2
sitio/.gitignore vendored
View file

@ -4,7 +4,7 @@ node_modules
/.svelte-kit /.svelte-kit
/package /package
.env .env
*.local .env.*
!.env.example !.env.example
vite.config.js.timestamp-* vite.config.js.timestamp-*
vite.config.ts.timestamp-* vite.config.ts.timestamp-*

View file

@ -40,7 +40,6 @@
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",
"drizzle-orm": "^0.32.0", "drizzle-orm": "^0.32.0",
"ky": "^1.5.0",
"zod": "^3.22.4" "zod": "^3.22.4"
}, },
"packageManager": "pnpm@9.5.0+sha512.140036830124618d624a2187b50d04289d5a087f326c9edfc0ccd733d76c4f52c3a313d4fc148794a2a9d81553016004e6742e8cf850670268a7387fc220c903" "packageManager": "pnpm@9.5.0+sha512.140036830124618d624a2187b50d04289d5a087f326c9edfc0ccd733d76c4f52c3a313d4fc148794a2a9d81553016004e6742e8cf850670268a7387fc220c903"

View file

@ -1,9 +1,5 @@
<script lang="ts" context="module"> <script lang="ts" context="module">
export type Product = { export type Product = { ean: string; name: string; imageUrl: string | null };
ean: string;
name: string | null;
image_url: string | null;
};
</script> </script>
<script lang="ts"> <script lang="ts">
@ -11,9 +7,9 @@
</script> </script>
<a href={`/ean/${product.ean}`} class="flex gap-2"> <a href={`/ean/${product.ean}`} class="flex gap-2">
{#if product.image_url} {#if product.imageUrl}
<img <img
src={product.image_url} src={product.imageUrl}
alt={product.name} alt={product.name}
class="max-h-48" class="max-h-48"
loading="lazy" loading="lazy"

View file

@ -1,2 +1 @@
// place files you want to import through the `$lib` alias in this folder. // place files you want to import through the `$lib` alias in this folder.
export const API_HOST = import.meta.env.VITE_API_HOST;

View file

@ -0,0 +1,2 @@
export { getDb } from "db-datos/db.js";
export * as schema from "db-datos/schema.js";

View file

@ -1,17 +1,15 @@
import { countDistinct } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { z } from "zod"; import { getDb, schema } from "$lib/server/db";
import ky from "ky"; const { precios } = schema;
import { API_HOST } from "$lib";
async function getInfo() {
return z
.object({
count: z.number(),
})
.parse(await ky.get(`${API_HOST}/api/0/info`).json());
}
export const load: PageServerLoad = async () => { export const load: PageServerLoad = async () => {
const nProductos = (await getInfo()).count; const db = await getDb();
const nProductosR = await db
.select({
count: countDistinct(precios.ean),
})
.from(precios);
const nProductos = nProductosR[0].count;
return { nProductos }; return { nProductos };
}; };

View file

@ -1,29 +1,68 @@
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios, bestSelling } = schema;
import { max, sql } from "drizzle-orm";
import z from "zod"; import z from "zod";
import type { Product } from "$lib/ProductPreview.svelte";
async function getBestSelling() { type Data = {
const res = await fetch( category: string;
`${import.meta.env.VITE_API_HOST}/api/0/best-selling-products`, products: Product[];
}[];
let cache: Promise<{ key: Date; data: Data }> = doQuery();
async function doQuery() {
const db = await getDb();
const categories = await db
.select({
fetchedAt: bestSelling.fetchedAt,
category: bestSelling.category,
eansJson: bestSelling.eansJson,
})
.from(bestSelling)
.groupBy(bestSelling.category)
.having(max(bestSelling.fetchedAt));
const categoriesWithProducts = await Promise.all(
categories.map(async (category) => {
const eans = z.array(z.string()).parse(JSON.parse(category.eansJson));
const products = await db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios)
.where(sql`${precios.ean} in ${eans}`)
.groupBy(precios.ean)
.having(max(precios.fetchedAt));
return {
category: category.category,
products: eans
.map((ean) => products.find((p) => p.ean === ean))
.filter((x): x is Product => !!x && !!x.name),
};
}),
); );
const json = await res.json();
return z return { key: new Date(), data: categoriesWithProducts };
.array(
z.object({
category: z.string(),
products: z.array(
z.object({
ean: z.string(),
name: z.string().nullable(),
image_url: z.string().nullable(),
}),
),
}),
)
.parse(json);
} }
export const load: PageServerLoad = async ({ params }) => { console.log("setting up interval");
return { setInterval(
data: await getBestSelling(), async () => {
}; const c = await doQuery();
cache = Promise.resolve(c);
},
4 * 60 * 60 * 1000,
);
export const load: PageServerLoad = async ({
params,
}): Promise<{ data: Data }> => {
return { data: (await cache).data };
}; };

View file

@ -1,23 +1,20 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { z } from "zod"; import { getDb, schema } from "$lib/server/db";
import { zPrecio, type Precio } from "./common"; const { precios } = schema;
import { API_HOST } from "$lib";
async function getProductHistory(ean: string) {
const res = await fetch(`${API_HOST}/api/0/ean/${ean}/history`);
const json = await res.json();
return z.array(zPrecio).parse(json);
}
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const res = await getProductHistory(params.ean); const db = await getDb();
const q = db
.select()
.from(precios)
.where(eq(precios.ean, params.ean))
.orderBy(precios.fetchedAt);
const res = await q;
if (res.length === 0) return error(404, "Not Found"); if (res.length === 0) return error(404, "Not Found");
const meta = res.findLast( const meta = res.findLast((p) => p.name);
(p): p is Precio & { name: string; image_url: string } =>
!!(p.name && p.image_url),
);
return { precios: res, meta }; return { precios: res, meta };
}; };

View file

@ -1,18 +1,18 @@
<script lang="ts"> <script lang="ts">
import { Supermercado, hosts } from "db-datos/supermercado"; import { Supermercado, hosts } from "db-datos/supermercado";
import * as schema from "db-datos/schema";
import type { PageData } from "./$types"; import type { PageData } from "./$types";
import Chart from "./Chart.svelte"; import Chart from "./Chart.svelte";
import type { Precio } from "./common";
export let data: PageData; export let data: PageData;
let urls: Map<Supermercado, Precio>; let urls: Map<Supermercado, schema.Precio>;
$: urls = data.precios.reduce((prev, curr) => { $: urls = data.precios.reduce((prev, curr) => {
const url = new URL(curr.url); const url = new URL(curr.url);
const supermercado = hosts[url.hostname]; const supermercado = hosts[url.hostname];
prev.set(supermercado, curr); prev.set(supermercado, curr);
return prev; return prev;
}, new Map<Supermercado, Precio>()); }, new Map<Supermercado, schema.Precio>());
const classBySupermercado: { [supermercado in Supermercado]: string } = { const classBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]", [Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
@ -30,18 +30,18 @@
{#if data.meta} {#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1> <h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.image_url} alt={data.meta.name} class="max-h-48" /> <img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
<div class="flex gap-2"> <div class="flex gap-2">
{#each urls as [supermercado, { url, precio_centavos }]} {#each urls as [supermercado, { url, precioCentavos }]}
<a <a
href={url} href={url}
rel="noreferrer noopener" rel="noreferrer noopener"
target="_blank" target="_blank"
class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`} class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
> >
{#if precio_centavos} {#if precioCentavos}
<span class="text-lg font-bold" <span class="text-lg font-bold"
>{formatter.format(precio_centavos / 100)}</span >{formatter.format(precioCentavos / 100)}</span
> >
{/if} {/if}
<span class="text-sm">{supermercado}</span> <span class="text-sm">{supermercado}</span>

View file

@ -1,8 +1,8 @@
<script lang="ts"> <script lang="ts">
import type { Precio } from "db-datos/schema";
// import dayjs from "dayjs"; // import dayjs from "dayjs";
import ChartJs from "./ChartJs.svelte"; import ChartJs from "./ChartJs.svelte";
import { hosts, colorBySupermercado } from "db-datos/supermercado"; import { hosts, colorBySupermercado } from "db-datos/supermercado";
import type { Precio } from "./common";
export let precios: Precio[]; export let precios: Precio[];
@ -15,15 +15,15 @@
const ps = precios const ps = precios
.filter((p) => new URL(p.url!).hostname === host) .filter((p) => new URL(p.url!).hostname === host)
.filter( .filter(
(p): p is Precio & { precio_centavos: number } => (p): p is Precio & { precioCentavos: number } =>
p.precio_centavos !== null, p.precioCentavos !== null,
); );
return { return {
label: supermercado, label: supermercado,
data: [ data: [
...ps.map((p) => ({ ...ps.map((p) => ({
x: p.fetched_at, x: p.fetchedAt,
y: p.precio_centavos / 100, y: p.precioCentavos / 100,
})), })),
// lie // lie
// ...ps.map((p) => ({ // ...ps.map((p) => ({

View file

@ -1,12 +0,0 @@
import { z } from "zod";
export const zPrecio = z.object({
ean: z.string(),
fetched_at: z.coerce.date(),
precio_centavos: z.number().nullable(),
in_stock: z.boolean().nullable(),
url: z.string(),
name: z.string().nullable(),
image_url: z.string().nullable(),
});
export type Precio = z.infer<typeof zPrecio>;

View file

@ -1,29 +1,26 @@
import { z } from "zod"; import { sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { API_HOST } from "$lib"; import { getDb } from "$lib/server/db";
import ky from "ky";
const zProductResult = z.object({
ean: z.string(),
name: z.string(),
image_url: z.string(),
});
async function search(query: string) {
return z
.array(zProductResult)
.parse(
await ky
.get(`${API_HOST}/api/0/search/${encodeURIComponent(query)}`)
.json(),
);
}
export const load: PageServerLoad = async ({ url }) => { export const load: PageServerLoad = async ({ url }) => {
const db = await getDb();
const query = url.searchParams.get("q"); const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; image_url: string }[] = query let results: null | { ean: string; name: string; imageUrl: string }[] = null;
? await search(query) if (query) {
: null; const sQuery = query
.replaceAll(`"`, `""`)
.split(" ")
.map((s) => `"${s}"`)
.join(" ");
console.debug(sQuery);
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean
where f.name match ${sQuery}
group by p.ean
having max(p.fetched_at)
order by p.in_stock desc;`;
results = db.all(sqlQuery);
}
return { query, results }; return { query, results };
}; };