Compare commits

...

45 commits

SHA1 Message Date
91c7087bdc retry all errors 2024-08-28 21:12:34 -03:00
6b98e1eb24 timeout 2024-08-28 21:00:34 -03:00
814041eaf5 fetch with retry 2024-08-28 20:59:03 -03:00
dd6fc0b172 use smaller machines 2024-08-28 20:55:35 -03:00
5dbfa0c39c retry 2024-08-28 20:54:46 -03:00
3e169a095c don't use arm 2024-08-28 20:53:02 -03:00
76c8e4bf1c ubicloud 2024-08-28 20:47:11 -03:00
107d91711f improve the archives 2024-08-28 20:46:30 -03:00
a6e972ce83 fix schema 2024-08-28 20:29:58 -03:00
2dc7c35a63 run on github runners 2024-08-28 15:34:59 -03:00
21c661120a import sucursales and fix things 2024-08-25 12:48:10 -03:00
97b93c0342 disable tmate 2024-08-24 22:59:22 -03:00
4d87fdbf25 use curl lol 2024-08-24 22:57:24 -03:00
665110a2d6 tmate 2024-08-24 22:40:10 -03:00
bdd15ec895 upterm 2024-08-24 22:01:28 -03:00
213cd832b7 sepa-precios-importer 2024-08-24 21:58:25 -03:00
93b2f8c1a1 upterm less 2024-08-24 17:00:13 -03:00
2173311731 ubicloud arm 2024-08-24 16:56:43 -03:00
80d49cd99c upterm 2024-08-24 16:52:49 -03:00
eedd19f894 user config 2024-08-24 12:42:03 -03:00
fa46fc619b sqlx 2024-08-24 11:48:30 -03:00
bef5a74fcd secrets 2024-08-24 11:47:14 -03:00
0a8178d649 actually archive 2024-08-24 11:45:38 -03:00
cdac98d251 lockfile 2024-08-24 00:01:15 -03:00
79e1df717f change name 2024-08-24 00:00:38 -03:00
f0c6f711e7 bun install 2024-08-24 00:00:18 -03:00
aaea1524db init sepa-precios-archiver 2024-08-23 23:59:02 -03:00
a48d0eb3a7 fix best selling 2024-08-19 21:00:45 -03:00
ac4b33c6e6 alsdjfaosfghls 2024-08-19 21:00:15 -03:00
6adcb07b88 Revert "index ean, fetched_at" (reverts commit 8617a0b2a5) 2024-08-19 20:57:56 -03:00
8f82631b77 Merge branch 'new-rust-api' 2024-08-19 20:57:26 -03:00
adf1d7ac59 use 50 coroutines for coto 2024-08-19 20:47:33 -03:00
5943d80252 use db.db 2024-08-13 20:13:02 -03:00
905d94a55e trim harder (closes #44) 2024-08-04 17:31:00 -03:00
8617a0b2a5 index ean, fetched_at 2024-08-04 15:35:43 -03:00
d87fa5f905 delete sitio's db 2024-08-04 15:31:36 -03:00
76db90618c search 2024-08-04 15:30:59 -03:00
a3bdc59b73 count 2024-08-04 14:48:54 -03:00
d38b2a8cb0 history 2024-08-04 14:38:59 -03:00
4bf1351688 fix: use env for API HOST 2024-08-04 13:03:52 -03:00
8d9fce5293 fix types 2024-08-04 12:57:01 -03:00
a1fa2796ef sitio: adapt homepage to use api 2024-08-04 12:55:20 -03:00
f01213aaf8 gitignore 2024-08-04 12:49:40 -03:00
b02c3f84c5 more wip 2024-08-04 12:49:37 -03:00
4a63e5f6c8 WIP 2024-07-21 11:09:38 -03:00
50 changed files with 1679 additions and 218 deletions


@ -2,6 +2,8 @@ data/warcs/
data/carrefour/
*/*.db*
sqlite.db
db.db
db.db-wal
downloader/
node_modules/
*/node_modules/


@ -0,0 +1,31 @@
name: Sepa Precios Archiver
on:
schedule:
- cron: "0 */12 * * *" # Run every 12 hours
workflow_dispatch: # Allow manual trigger
jobs:
archive-prices:
runs-on: ubicloud-standard-4
steps:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
with:
bun-version: latest
# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3
# with:
# limit-access-to-actor: true
- name: Run archiver script
env:
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
B2_BUCKET_NAME: ${{ secrets.B2_BUCKET_NAME }}
B2_BUCKET_KEY_ID: ${{ secrets.B2_BUCKET_KEY_ID }}
B2_BUCKET_KEY: ${{ secrets.B2_BUCKET_KEY }}
run: |
cd sepa-precios-archiver
bun install --frozen-lockfile
bun index.ts

.gitignore (vendored, 4 changes)

@ -3,5 +3,7 @@ node_modules/
*.db-shm
*.db-wal
target/
.env.*
*.local
.DS_Store


@ -0,0 +1 @@
DB_PATH=../db.db


@ -1,4 +1,4 @@
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
export const DB_PATH = process.env.DB_PATH ?? "../db.db";
/** @type { import("drizzle-kit").Config } */
export default {


@ -0,0 +1,2 @@
-- Custom SQL migration file, put your code below! --
CREATE VIRTUAL TABLE productos_fts USING fts5 (ean, name, content = precios, content_rowid = id);
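productos_fts is an external-content FTS5 table: it stores only the search index and reads row data back from precios by rowid at query time, which is why content_rowid should name that table's integer primary key. As a rough sketch of how it ends up being queried (mirroring the search query added later in this diff; the db.db path and the search term are assumptions), runnable with bun:sqlite:

```ts
import { Database } from "bun:sqlite";

// Assumes a local db.db containing the precios table and this migration
const db = new Database("../db.db", { readonly: true });

// Same shape as the /api/0/search/:query endpoint below: FTS match first,
// then join back to precios for each EAN's most recently fetched row.
const results = db
  .query(
    `with search_results as (
       select f.ean from precios_fts f
       where f.name match ?1 and f.ean != ''
       group by f.ean limit 100
     )
     select p.ean, p.name, p.image_url from search_results s
     join precios p on p.ean = s.ean
     where p.fetched_at = (
       select max(fetched_at) from precios pf
       where pf.ean = s.ean and pf.name is not null
     )`,
  )
  .all(`"yerba"`);
console.log(results);
```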


@ -0,0 +1,208 @@
{
"id": "f981b295-c9eb-4df5-88b1-d3765e4cc314",
"prevId": "c95c6547-d540-45cf-aa9d-9d828efb468e",
"version": "6",
"dialect": "sqlite",
"tables": {
"db_best_selling": {
"name": "db_best_selling",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"category": {
"name": "category",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"eans_json": {
"name": "eans_json",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
},
"precios_url_idx": {
"name": "precios_url_idx",
"columns": [
"url"
],
"isUnique": false
},
"precios_fetched_at_idx": {
"name": "precios_fetched_at_idx",
"columns": [
"fetched_at"
],
"isUnique": false
},
"precios_ean_fetched_at_idx": {
"name": "precios_ean_fetched_at_idx",
"columns": [
"ean",
"fetched_at"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
},
"internal": {
"indexes": {}
}
}


@ -94,4 +94,4 @@
"breakpoints": true
}
]
}
}


@ -39,12 +39,12 @@ importers:
dayjs:
specifier: ^1.11.10
version: 1.11.10
drizzle-kit:
specifier: ^0.23.0
version: 0.23.0
drizzle-orm:
specifier: ^0.32.0
version: 0.32.0(@types/better-sqlite3@7.6.9)(better-sqlite3@11.1.2)
ky:
specifier: ^1.5.0
version: 1.5.0
zod:
specifier: ^3.22.4
version: 3.22.4
@ -1227,6 +1227,10 @@ packages:
resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==}
engines: {node: '>=6'}
ky@1.5.0:
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
engines: {node: '>=18'}
lilconfig@2.1.0:
resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==}
engines: {node: '>=10'}
@ -2678,6 +2682,8 @@ snapshots:
kleur@4.1.5: {}
ky@1.5.0: {}
lilconfig@2.1.0: {}
lilconfig@3.1.1: {}


@ -1 +1 @@
DATABASE_URL=sqlite://../sqlite.db
DATABASE_URL=sqlite://../db.db


@ -0,0 +1,20 @@
{
"db_name": "SQLite",
"query": "select count(distinct ean) as count from precios",
"describe": {
"columns": [
{
"name": "count",
"ordinal": 0,
"type_info": "Integer"
}
],
"parameters": {
"Right": 0
},
"nullable": [
false
]
},
"hash": "2e632fbda989abf0d8a88a1d3bc1de0a9aefb0d3f3cdc33d26158d09faed97b2"
}


@ -0,0 +1,38 @@
{
"db_name": "SQLite",
"query": "with search_results as (\n select f.ean from precios_fts f\n where f.name match ? and f.ean != ''\n group by f.ean\n\t\t\tlimit 100\n )\n select p.id, p.ean, p.name, p.image_url from search_results as s\n join precios as p\n on p.ean = s.ean\n where p.fetched_at = (\n SELECT MAX(fetched_at)\n FROM precios as pf\n WHERE pf.ean = s.ean and pf.name is not null\n );",
"describe": {
"columns": [
{
"name": "id",
"ordinal": 0,
"type_info": "Integer"
},
{
"name": "ean",
"ordinal": 1,
"type_info": "Text"
},
{
"name": "name",
"ordinal": 2,
"type_info": "Text"
},
{
"name": "image_url",
"ordinal": 3,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false,
false,
true,
true
]
},
"hash": "3ee249afda554bbffe736257af05aba689c71188ce1a869e01988ac7ca1220a2"
}


@ -6,7 +6,7 @@
{
"name": "count",
"ordinal": 0,
"type_info": "Int"
"type_info": "Integer"
}
],
"parameters": {


@ -0,0 +1,56 @@
{
"db_name": "SQLite",
"query": "\nselect ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios\nwhere ean = ?\norder by fetched_at\n",
"describe": {
"columns": [
{
"name": "ean",
"ordinal": 0,
"type_info": "Text"
},
{
"name": "fetched_at",
"ordinal": 1,
"type_info": "Integer"
},
{
"name": "precio_centavos",
"ordinal": 2,
"type_info": "Integer"
},
{
"name": "in_stock",
"ordinal": 3,
"type_info": "Integer"
},
{
"name": "url",
"ordinal": 4,
"type_info": "Text"
},
{
"name": "name",
"ordinal": 5,
"type_info": "Text"
},
{
"name": "image_url",
"ordinal": 6,
"type_info": "Text"
}
],
"parameters": {
"Right": 1
},
"nullable": [
false,
false,
true,
true,
false,
true,
true
]
},
"hash": "88a597e29390fb04bbc48d9f88303551e068ddc478b037354c62bc77bc70ad96"
}


@ -6,7 +6,7 @@
{
"name": "count",
"ordinal": 0,
"type_info": "Int"
"type_info": "Integer"
}
],
"parameters": {

rust/Cargo.lock (generated, 135 changes)

@ -35,7 +35,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"getrandom 0.2.15",
"once_cell",
"version_check",
"zerocopy",
@ -163,7 +162,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -354,6 +353,7 @@ dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.6",
]
@ -386,10 +386,10 @@ version = "4.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
dependencies = [
"heck 0.5.0",
"heck",
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -404,6 +404,15 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
[[package]]
name = "concurrent-queue"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "const-oid"
version = "0.9.6"
@ -552,9 +561,14 @@ dependencies = [
[[package]]
name = "event-listener"
version = "2.5.3"
version = "5.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]]
name = "fastrand"
@ -665,7 +679,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -748,22 +762,13 @@ dependencies = [
[[package]]
name = "hashlink"
version = "0.8.4"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
dependencies = [
"hashbrown",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "heck"
version = "0.5.0"
@ -1033,9 +1038,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "libsqlite3-sys"
version = "0.27.0"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716"
checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f"
dependencies = [
"cc",
"pkg-config",
@ -1234,6 +1239,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
[[package]]
name = "parking_lot"
version = "0.11.2"
@ -1320,7 +1331,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -1794,7 +1805,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -1900,6 +1911,9 @@ name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
dependencies = [
"serde",
]
[[package]]
name = "socket2"
@ -1942,9 +1956,9 @@ dependencies = [
[[package]]
name = "sqlx"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9a2ccff1a000a5a59cd33da541d9f2fdcd9e6e8229cc200565942bff36d0aaa"
checksum = "27144619c6e5802f1380337a209d2ac1c431002dd74c6e60aebff3c506dc4f0c"
dependencies = [
"sqlx-core",
"sqlx-macros",
@ -1955,11 +1969,10 @@ dependencies = [
[[package]]
name = "sqlx-core"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24ba59a9342a3d9bab6c56c118be528b27c9b60e490080e9711a04dccac83ef6"
checksum = "a999083c1af5b5d6c071d34a708a19ba3e02106ad82ef7bbd69f5e48266b613b"
dependencies = [
"ahash",
"atoi",
"byteorder",
"bytes",
@ -1973,6 +1986,7 @@ dependencies = [
"futures-intrusive",
"futures-io",
"futures-util",
"hashbrown",
"hashlink",
"hex",
"indexmap",
@ -1995,26 +2009,26 @@ dependencies = [
[[package]]
name = "sqlx-macros"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ea40e2345eb2faa9e1e5e326db8c34711317d2b5e08d0d5741619048a803127"
checksum = "a23217eb7d86c584b8cbe0337b9eacf12ab76fe7673c513141ec42565698bb88"
dependencies = [
"proc-macro2",
"quote",
"sqlx-core",
"sqlx-macros-core",
"syn 1.0.109",
"syn",
]
[[package]]
name = "sqlx-macros-core"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8"
checksum = "1a099220ae541c5db479c6424bdf1b200987934033c2584f79a0e1693601e776"
dependencies = [
"dotenvy",
"either",
"heck 0.4.1",
"heck",
"hex",
"once_cell",
"proc-macro2",
@ -2026,7 +2040,7 @@ dependencies = [
"sqlx-mysql",
"sqlx-postgres",
"sqlx-sqlite",
"syn 1.0.109",
"syn",
"tempfile",
"tokio",
"url",
@ -2034,12 +2048,12 @@ dependencies = [
[[package]]
name = "sqlx-mysql"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ed31390216d20e538e447a7a9b959e06ed9fc51c37b514b46eb758016ecd418"
checksum = "5afe4c38a9b417b6a9a5eeffe7235d0a106716495536e7727d1c7f4b1ff3eba6"
dependencies = [
"atoi",
"base64 0.21.7",
"base64 0.22.1",
"bitflags 2.6.0",
"byteorder",
"bytes",
@ -2077,12 +2091,12 @@ dependencies = [
[[package]]
name = "sqlx-postgres"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c824eb80b894f926f89a0b9da0c7f435d27cdd35b8c655b114e58223918577e"
checksum = "b1dbb157e65f10dbe01f729339c06d239120221c9ad9fa0ba8408c4cc18ecf21"
dependencies = [
"atoi",
"base64 0.21.7",
"base64 0.22.1",
"bitflags 2.6.0",
"byteorder",
"chrono",
@ -2116,9 +2130,9 @@ dependencies = [
[[package]]
name = "sqlx-sqlite"
version = "0.7.4"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b244ef0a8414da0bed4bb1910426e890b19e5e9bccc27ada6b797d05c55ae0aa"
checksum = "9b2cdd83c008a622d94499c0006d8ee5f821f36c89b7d625c900e5dc30b5c5ee"
dependencies = [
"atoi",
"chrono",
@ -2132,10 +2146,10 @@ dependencies = [
"log",
"percent-encoding",
"serde",
"serde_urlencoded",
"sqlx-core",
"tracing",
"url",
"urlencoding",
]
[[package]]
@ -2161,17 +2175,6 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.71"
@ -2224,7 +2227,7 @@ checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -2285,7 +2288,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -2383,7 +2386,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]
@ -2464,12 +2467,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291"
[[package]]
name = "unicode-segmentation"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
[[package]]
name = "unicode_categories"
version = "0.1.1"
@ -2493,12 +2490,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "urlencoding"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf8-width"
version = "0.1.7"
@ -2577,7 +2568,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
"wasm-bindgen-shared",
]
@ -2611,7 +2602,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -2863,7 +2854,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.71",
"syn",
]
[[package]]


@ -9,10 +9,10 @@ edition = "2021"
again = "0.1.2"
anyhow = "1.0.79"
base64 = "0.21.7"
chrono = "0.4"
chrono = { version = "0.4", features = ["serde"] }
clap = { version = "4.4.15", features = ["derive"] }
cron = "0.12.0"
sqlx = { version = "0.7", features = [ "runtime-tokio", "sqlite", "chrono" ] }
sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite", "chrono", "json" ] }
futures = "0.3.30"
html-escape = "0.2.13"
itertools = "0.12.0"


@ -1,8 +1,16 @@
use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
use axum::{
extract::{Path, State},
http::StatusCode,
response::IntoResponse,
routing::get,
Json, Router,
};
use chrono::{DateTime, Utc};
use clap::ValueEnum;
use futures::future::join_all;
use itertools::Itertools;
use preciazo::supermercado::Supermercado;
use serde::Serialize;
use sqlx::{
sqlite::{SqliteConnectOptions, SqlitePoolOptions},
SqlitePool,
@ -94,31 +102,220 @@ async fn healthcheck(State(pool): State<SqlitePool>) -> impl IntoResponse {
}
}
#[derive(Serialize)]
struct CategoryWithProducts {
category: String,
products: Vec<Product>,
}
#[derive(Serialize)]
struct Product {
ean: String,
name: Option<String>,
image_url: Option<String>,
}
async fn get_best_selling(State(pool): State<SqlitePool>) -> impl IntoResponse {
#[derive(sqlx::FromRow, Debug)]
struct ProductWithCategory {
category: String,
ean: String,
name: Option<String>,
image_url: Option<String>,
}
let products_with_category = sqlx::query_as::<_, ProductWithCategory>(
"with latest_best_selling as (
select category, eans_json
from db_best_selling
group by category
having max(fetched_at)
),
extracted_eans as (
select latest_best_selling.category, json.value as ean
from latest_best_selling, json_each(latest_best_selling.eans_json) json
)
select extracted_eans.category, extracted_eans.ean, precios.image_url, name
from extracted_eans
join precios
on extracted_eans.ean = precios.ean
where
precios.fetched_at = (
SELECT MAX(fetched_at)
FROM precios
WHERE ean = extracted_eans.ean
)",
)
.fetch_all(&pool)
.await
.unwrap();
let categories = products_with_category
.iter()
.map(|p| p.category.clone())
.unique()
.collect_vec();
let categories_with_products = categories
.into_iter()
.map(|c| CategoryWithProducts {
category: c.clone(),
products: products_with_category
.iter()
.filter(|p| p.category == c)
.map(|p| Product {
ean: p.ean.clone(),
image_url: p.image_url.clone(),
name: p.name.clone(),
})
.collect_vec(),
})
.collect_vec();
Json(categories_with_products)
}
async fn get_product_history(
State(pool): State<SqlitePool>,
Path(ean): Path<String>,
) -> impl IntoResponse {
#[derive(sqlx::FromRow, Debug, Serialize)]
struct Precio {
ean: String,
fetched_at: chrono::DateTime<Utc>,
precio_centavos: Option<i64>,
in_stock: Option<bool>,
url: String,
name: Option<String>,
image_url: Option<String>,
}
let precios = sqlx::query!(
"
select ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios
where ean = ?
order by fetched_at
",
ean
)
.map(|r| Precio {
ean: r.ean,
url: r.url,
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
image_url: r.image_url,
name: r.name,
in_stock: r.in_stock.map(|x| x == 1),
precio_centavos: r.precio_centavos,
})
.fetch_all(&pool)
.await
.unwrap();
Json(precios)
}
async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> impl IntoResponse {
let sql_query = query
.clone()
.replace("\"", "\"\"")
.split(" ")
.map(|x| format!("\"{}\"", x))
.join(" ");
#[derive(Serialize)]
struct Result {
ean: String,
name: String,
image_url: String,
}
let results = sqlx::query!(
"with search_results as (
select f.ean from precios_fts f
where f.name match ? and f.ean != ''
group by f.ean
limit 100
)
select p.id, p.ean, p.name, p.image_url from search_results as s
join precios as p
on p.ean = s.ean
where p.fetched_at = (
SELECT MAX(fetched_at)
FROM precios as pf
WHERE pf.ean = s.ean and pf.name is not null
);",
sql_query
)
.fetch_all(&pool)
.await
.unwrap()
.into_iter()
.map(|r| Result {
ean: r.ean,
image_url: r.image_url.unwrap(),
name: r.name.unwrap(),
})
.collect_vec();
Json(results)
}
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
#[derive(Serialize)]
struct Info {
count: i64,
}
let count = sqlx::query!("select count(distinct ean) as count from precios")
.fetch_one(&pool)
.await
.unwrap()
.count;
Json(Info { count })
}
#[tokio::main]
async fn main() {
tracing_subscriber::fmt::init();
let pool = SqlitePoolOptions::new()
.max_connections(1)
.max_connections(10)
.connect_with(
SqliteConnectOptions::from_str(&format!(
"sqlite://{}",
env::var("DB_PATH").unwrap_or("../sqlite.db".to_string())
env::var("DB_PATH").unwrap_or("../db.db".to_string())
))
.unwrap()
.journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)
.busy_timeout(Duration::from_secs(15))
.synchronous(sqlx::sqlite::SqliteSynchronous::Normal)
.busy_timeout(Duration::from_secs(30))
.optimize_on_close(true, None),
)
.await
.expect("can't connect to database");
sqlx::query("pragma temp_store = memory;")
.execute(&pool)
.await
.unwrap();
sqlx::query("pragma mmap_size = 30000000000;")
.execute(&pool)
.await
.unwrap();
sqlx::query("pragma page_size = 4096;")
.execute(&pool)
.await
.unwrap();
let app = Router::new()
.route("/", get(index))
.route("/api/healthcheck", get(healthcheck))
.route("/api/0/best-selling-products", get(get_best_selling))
.route("/api/0/ean/:ean/history", get(get_product_history))
.route("/api/0/info", get(get_info))
.route("/api/0/search/:query", get(search))
.with_state(pool);
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
tracing::debug!("listening on {}", listener.local_addr().unwrap());
tracing::info!("listening on {}", listener.local_addr().unwrap());
axum::serve(listener, app).await.unwrap();
}


@ -1,3 +1,5 @@
use std::env;
use super::now_sec;
use super::AutoArgs;
use super::AutoTelegram;
@ -64,7 +66,16 @@ impl Auto {
// }
{
let t0 = now_sec();
let counters = self.scraper.fetch_list(&self.db, links).await;
let n_coroutines = if supermercado == Supermercado::Coto {
50
} else {
env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número")
};
let counters = self.scraper.fetch_list(&self.db, links, n_coroutines).await;
self.inform(&format!(
"Downloaded {:?}: {:?} (took {})",
&supermercado,


@ -17,7 +17,7 @@ pub struct Db {
impl Db {
pub async fn connect() -> anyhow::Result<Self> {
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
let db_path = env::var("DB_PATH").unwrap_or("../db.db".to_string());
info!("Opening DB at {}", db_path);
let read_pool = connect_to_db(&db_path, 32).await?;
let write_pool = connect_to_db(&db_path, 1).await?;


@ -99,7 +99,7 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
let db = Db::connect().await?;
let scraper = Scraper::from_env().await?;
let counters = scraper.fetch_list(&db, links).await;
let counters = scraper.fetch_list(&db, links, 100).await;
println!("Finished: {:?}", counters);
Ok(())


@ -128,11 +128,7 @@ impl Scraper {
counters
}
pub async fn fetch_list(&self, db: &Db, links: Vec<String>) -> Counters {
let n_coroutines = env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número");
pub async fn fetch_list(&self, db: &Db, links: Vec<String>, n_coroutines: usize) -> Counters {
stream::iter(links)
.map(|url| {
let db = db.clone();


@ -56,7 +56,11 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
.find_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser()))
// https://github.com/catdevnull/preciazo/issues/24
.map(|s| html_escape::decode_html_entities(s.trim()).to_string());
.map(|s| {
html_escape::decode_html_entities(s.trim())
.trim()
.to_string()
});
let image_url = dom
.query_selector(".zoomImage1")


@ -207,14 +207,15 @@ pub async fn get_best_selling_by_category(
.append_pair("extensions", &{
let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets":
query.split('/').map(|f| json!({"key":"c","value":f})).collect::<Vec<_>>()
,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false,"showSponsored":false});
,"facetsBehavior":"Static","categoryTreeBehavior":"default",
"withFacets":false,"showSponsored":false,"advertisementOptions":{"showSponsored":false,"sponsoredCount":0,"advertisementPlacement":"top_search","repeatSponsoredProducts":true}});
let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string());
format!(
r#"{{
"persistedQuery": {{
"version": 1,
"sha256Hash": "fd92698fe375e8e4fa55d26fa62951d979b790fcf1032a6f02926081d199f550",
"sha256Hash": "8e3fd5f65d7d83516bfea23051b11e7aa469d85f26906f27e18afbee52c56ce4",
"sender": "vtex.store-resources@0.x",
"provider": "vtex.search-graphql@0.x"
}},

sepa-precios-archiver/.gitignore (vendored, new file, 175 changes)

@ -0,0 +1,175 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Caches
.cache
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# IntelliJ based IDEs
.idea
# Finder (MacOS) folder config
.DS_Store


@ -0,0 +1,19 @@
# sepa-precios-archiver
Archiver for the [Precios Claros - Base SEPA](https://datos.produccion.gob.ar/dataset/sepa-precios) price dataset. Recompresses the files to take up ~8x less space, and re-uploads them to a Backblaze B2 bucket of mine.
## Installation
To install the dependencies:
```bash
bun install
```
To run it:
```bash
bun run index.ts
```
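The archives are produced with `tar -c | zstd -15 --long -T0`, so extracting one needs zstd's matching `--long` flag. A minimal sketch using Bun's shell (the archive name is hypothetical; real ones follow the `${resource.id}-${basename(resource.url)}-repackaged.tar.zst` pattern from index.ts):

```ts
import { $ } from "bun";

// Hypothetical file name, for illustration only
const archive = "RESOURCE_ID-sepa_lunes.zip-repackaged.tar.zst";

// zstd ran with --long when compressing, so decompression needs it too
await $`mkdir -p extracted`;
await $`zstd -d --long -c ${archive} | tar -x -C extracted`;
```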
This project was created using `bun init` in bun v1.1.25. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.

sepa-precios-archiver/bun.lockb (new executable file, binary; not shown)


@ -0,0 +1,190 @@
import { z } from "zod";
import { zDatasetInfo } from "./schemas";
import { mkdtemp, writeFile, readdir, mkdir, rm } from "fs/promises";
import { basename, extname, join } from "path";
import { $, write } from "bun";
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
function checkEnvVariable(variableName: string) {
const value = process.env[variableName];
if (value) {
console.log(`${variableName} is set`);
return value;
} else {
console.log(`${variableName} is not set`);
process.exit(1);
}
}
const GITHUB_TOKEN = checkEnvVariable("GITHUB_TOKEN");
const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");
const s3 = new S3Client({
endpoint: "https://s3.us-west-004.backblazeb2.com",
region: "us-west-004",
credentials: {
accessKeyId: B2_BUCKET_KEY_ID,
secretAccessKey: B2_BUCKET_KEY,
},
});
async function getRawDatasetInfo() {
const response = await fetchWithRetry(
"https://datos.produccion.gob.ar/api/3/action/package_show?id=sepa-precios",
);
const json = await response.json();
return json;
}
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
const dir = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");
try {
await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${dir}`;
await writeFile(
dir + "/dataset-info.json",
JSON.stringify(datasetInfo, null, 2),
);
await $`cd ${dir} && git add dataset-info.json`;
await $`cd ${dir} && git config user.email "git@nulo.in" && git config user.name "github actions"`;
await $`cd ${dir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
await $`cd ${dir} && git push origin main`;
} finally {
await $`rm -rf ${dir}`;
}
console.log(`✅ Saved dataset info into repo`);
}
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
try {
await s3.send(
new HeadObjectCommand({
Bucket: B2_BUCKET_NAME,
Key: fileName,
}),
);
return true;
} catch (error) {
if ((error as any).name === "NotFound") {
return false;
}
throw error;
}
}
async function uploadToB2Bucket(
fileName: string,
fileContent: ReadableStream | Blob | string,
) {
const upload = new Upload({
client: s3,
params: {
Bucket: B2_BUCKET_NAME,
Key: fileName,
Body: fileContent,
},
});
await upload.done();
}
const rawDatasetInfo = await getRawDatasetInfo();
await saveDatasetInfoIntoRepo(rawDatasetInfo);
let errored = false;
async function fetchWithRetry(
url: string,
maxRetries = 3,
waitTime = 15000,
): Promise<Response> {
let retries = 0;
while (retries < maxRetries) {
try {
const response = await fetch(url, {
signal: AbortSignal.timeout(waitTime),
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
return response;
} catch (error) {
console.error(`Attempt ${retries + 1} failed: ${error}`);
retries++;
if (retries >= maxRetries) {
throw error;
}
await new Promise((resolve) => setTimeout(resolve, 1000 * retries));
}
}
throw new Error("Max retries reached");
}
function checkRes(
res: Response,
): res is Response & { body: ReadableStream<Uint8Array> } {
if (!res.ok) {
console.error(`❌ Error downloading ${res.url}`);
errored = true;
return false;
}
if (!res.body) throw new Error(`❌ No body in response`);
return true;
}
await uploadToB2Bucket(
`timestamped-metadata/${new Date().toISOString()}.json`,
JSON.stringify(rawDatasetInfo, null, 2),
);
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
for (const resource of datasetInfo.result.resources) {
if (extname(resource.url) === ".zip") {
const fileName = `${resource.id}-${basename(resource.url)}-repackaged.tar.zst`;
if (await checkFileExistsInB2(fileName)) continue;
console.log(`⬇️ Downloading, repackaging and uploading ${resource.url}`);
const dir = await mkdtemp("/tmp/sepa-precios-archiver-repackage-");
console.info(dir);
try {
const zip = join(dir, "zip");
await $`curl --retry 8 --retry-delay 5 --retry-all-errors -L -o ${zip} ${resource.url}`;
await $`unzip ${zip} -d ${dir}`;
await rm(zip);
for (const file of await readdir(dir)) {
const path = join(dir, file);
if (extname(file) !== ".zip") continue;
const extractDir = join(dir, basename(file, ".zip"));
await mkdir(extractDir, { recursive: true });
await $`cd ${dir} && unzip ${path} -d ${extractDir}`;
await rm(path);
}
await writeFile(
join(dir, "dataset-info.json"),
JSON.stringify(rawDatasetInfo, null, 2),
);
const compressed =
await $`tar -c -C ${dir} . | zstd -15 --long -T0`.blob();
await uploadToB2Bucket(fileName, compressed);
} finally {
await $`rm -rf ${dir}`;
}
} else {
const fileName = `${resource.id}-${basename(resource.url)}`;
if (await checkFileExistsInB2(fileName)) continue;
console.log(`⬇️ Downloading and reuploading ${resource.url}`);
const response = await fetchWithRetry(resource.url, 3, 60 * 1000);
if (!checkRes(response)) continue;
await uploadToB2Bucket(fileName, response.body);
}
}
if (errored) {
process.exit(1);
}


@ -0,0 +1,16 @@
{
"name": "sepa-precios-archiver",
"module": "index.ts",
"type": "module",
"devDependencies": {
"@types/bun": "latest"
},
"peerDependencies": {
"typescript": "^5.5.4"
},
"dependencies": {
"@aws-sdk/client-s3": "^3.637.0",
"@aws-sdk/lib-storage": "^3.637.0",
"zod": "^3.23.8"
}
}


@ -0,0 +1,17 @@
import { z } from "zod";
export const zDatasetInfo = z.object({
metadata_modified: z.coerce.date(),
metadata_created: z.coerce.date(),
resources: z.array(
z.object({
id: z.string(),
size: z.number(),
format: z.string(),
created: z.coerce.date(),
url: z.string(),
modified: z.coerce.date().optional(),
description: z.string(),
}),
),
});


@ -0,0 +1,27 @@
{
"compilerOptions": {
// Enable latest features
"lib": ["ESNext", "DOM"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,
// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,
// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}

sepa-precios-importer/.gitignore (vendored, new file, 175 changes)

@ -0,0 +1,175 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Caches
.cache
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# IntelliJ based IDEs
.idea
# Finder (MacOS) folder config
.DS_Store


@ -0,0 +1,14 @@
# sepa-precios-importer
Importer for the [SEPA price datasets](https://datos.produccion.gob.ar/dataset/sepa-precios/archivo/d076720f-a7f0-4af8-b1d6-1b99d5a90c14) into a PostgreSQL database.
See [Errores en el formato de los datos SEPA](https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b) for notes on format errors in the SEPA data.
To install dependencies:
```bash
bun install
bun run index.ts ~/carpeta-con-datasets-descomprimidos
```
This project was created using `bun init` in bun v1.1.26. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
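Once a dataset is imported, the data can be queried with the same `postgres` client the importer uses. A small sketch (assuming the datasets/precios tables created by index.ts and a local "sepa-precios" database):

```ts
import postgres from "postgres";

// Same connection shape as the importer's own
const sql = postgres({ database: "sepa-precios" });

// Latest imported dataset and how many price rows it contributed
const [latest] = await sql`
  select id, name, date from datasets order by date desc limit 1
`;
const [{ count }] = await sql`
  select count(*)::int as count from precios where id_dataset = ${latest.id}
`;
console.log(`dataset ${latest.name} (${latest.date}): ${count} rows in precios`);

await sql.end();
```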

sepa-precios-importer/bun.lockb (new executable file, binary; not shown)


@ -0,0 +1,231 @@
import { readFile } from "fs/promises";
import Papa from "papaparse";
import { basename, join, dirname } from "path";
import postgres from "postgres";
import { Readable } from "stream";
import { pipeline } from "node:stream/promises";
import { Glob } from "bun";
const sql = postgres({
database: "sepa-precios",
});
// await sql`
// drop table if exists precios;`;
// await sql`
// drop table if exists datasets;`;
await sql`
CREATE TABLE if not exists datasets (
id SERIAL PRIMARY KEY,
name TEXT UNIQUE,
date DATE
);`;
await sql`
CREATE TABLE if not exists sucursales (
id_dataset INTEGER REFERENCES datasets(id),
id_comercio INTEGER,
id_bandera INTEGER,
id_sucursal INTEGER,
sucursales_nombre TEXT,
sucursales_tipo TEXT,
sucursales_calle TEXT,
sucursales_numero TEXT,
sucursales_latitud NUMERIC,
sucursales_longitud NUMERIC,
sucursales_observaciones TEXT,
sucursales_barrio TEXT,
sucursales_codigo_postal TEXT,
sucursales_localidad TEXT,
sucursales_provincia TEXT,
sucursales_lunes_horario_atencion TEXT,
sucursales_martes_horario_atencion TEXT,
sucursales_miercoles_horario_atencion TEXT,
sucursales_jueves_horario_atencion TEXT,
sucursales_viernes_horario_atencion TEXT,
sucursales_sabado_horario_atencion TEXT,
sucursales_domingo_horario_atencion TEXT,
UNIQUE (id_dataset, id_comercio, id_bandera, id_sucursal)
);`;
await sql`
CREATE TABLE if not exists precios (
id_dataset INTEGER REFERENCES datasets(id),
id_comercio INTEGER,
id_bandera INTEGER,
id_sucursal INTEGER,
id_producto BIGINT,
productos_ean INTEGER,
productos_descripcion TEXT,
productos_cantidad_presentacion NUMERIC(10, 2),
productos_unidad_medida_presentacion TEXT,
productos_marca TEXT,
productos_precio_lista NUMERIC(10, 2),
productos_precio_referencia NUMERIC(10, 2),
productos_cantidad_referencia NUMERIC(10, 2),
productos_unidad_medida_referencia TEXT,
productos_precio_unitario_promo1 NUMERIC(10, 2),
productos_leyenda_promo1 TEXT,
productos_precio_unitario_promo2 NUMERIC(10, 2),
productos_leyenda_promo2 TEXT,
FOREIGN KEY (id_dataset, id_comercio, id_bandera, id_sucursal) REFERENCES sucursales(id_dataset, id_comercio, id_bandera, id_sucursal)
);
`;
async function importSucursales(
sql: postgres.Sql,
datasetId: number,
dir: string,
) {
const sucursales: Papa.ParseResult<any> = Papa.parse(
await readFile(join(dir, "sucursales.csv"), "utf-8"),
{
header: true,
},
);
const objs = sucursales.data
.filter((data) => data.id_comercio && data.id_bandera && data.id_sucursal)
.map((data) => {
// Megatone
if ("sucursales_domingohorario_atencion" in data) {
data.sucursales_domingo_horario_atencion =
data.sucursales_domingohorario_atencion;
delete data.sucursales_domingohorario_atencion;
}
return {
id_dataset: datasetId,
...data,
};
});
const keys = Object.keys(objs[0]);
const lines = Readable.from(
objs.map((data) => keys.map((key) => (data as any)[key]).join("\t") + "\n"),
);
const writable =
await sql`copy sucursales (${sql.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
await pipeline(lines, writable);
}
async function importDataset(dir: string) {
const date = basename(dir).match(/(\d{4}-\d{2}-\d{2})/)![1];
// TODO: parse the "Ultima actualizacion" line at the end of the CSV and insert it into the datasets table
// {
// const res =
// await sql`select id from datasets where name = ${basename(dir)}`;
// await importSucursales(sql, res[0].id, dir);
// }
try {
await sql.begin(async (sql) => {
let datasetId: number;
const res =
await sql`insert into datasets (name, date) values (${basename(dir)}, ${date}) returning id`;
datasetId = res[0].id;
const datas: any[] = [];
const comercios: Papa.ParseResult<{ comercio_cuit: string }> = Papa.parse(
await readFile(join(dir, "comercio.csv"), "utf-8"),
{ header: true },
);
const comercioCuit = comercios.data[0].comercio_cuit;
console.log(`dataset ${datasetId}, comercio ${comercioCuit}`);
await importSucursales(sql, datasetId, dir);
let file = await readFile(join(dir, "productos.csv"), "utf-8");
// WALL OF SHAME: these providers don't know how to produce valid CSVs
if (comercioCuit == "30612929455") {
// Libertad S.A.
file = file.replaceAll("|RAPTOR 6X16X45", "/RAPTOR 6X16X45");
} else if (comercioCuit == "30578411174") {
// Alberdi S.A.
file = file.replaceAll(";", "|");
}
if (
["33504047089", "30707429468", "30589621499"].includes(comercioCuit)
) {
// TODO: they do have the values, but under different names; for example,
// productos_precio_lista would be precio_unitario_bulto_por_unidad_venta_con_iva.
// But I don't want to lie, so for now I'm not importing them.
console.error(
`No voy a importar el dataset ${dir} porque el formato está mal. Pero se podría importar. Pero por ahora no lo voy a hacer. Véase https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b`,
);
return;
}
console.time("parse");
return await new Promise((resolve, reject) => {
Papa.parse(file, {
header: true,
step: function (result: any) {
const { data } = result;
if (
data.id_comercio &&
data.id_bandera &&
data.id_sucursal &&
data.id_producto
)
datas.push(data);
},
complete: async function () {
try {
console.timeEnd("parse");
console.time("map");
const objs = datas.map((data) => {
delete data.id_dun_14;
return {
id_dataset: datasetId,
...data,
productos_descripcion: data.productos_descripcion.replaceAll(
"\t",
" ",
),
};
});
if (!objs.length) {
console.error(`No hay datos para el dataset ${dir}`);
return;
}
const keys = Object.keys(objs[0]);
const lines = Readable.from(
objs.map(
(data) => keys.map((key) => data[key]).join("\t") + "\n",
),
);
console.timeEnd("map");
console.time("copy");
const writable =
await sql`copy precios (${sql.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
await pipeline(lines, writable);
console.timeEnd("copy");
console.info(`saved ${objs.length} rows`);
} catch (e) {
reject(e);
return;
} finally {
Bun.gc(true);
resolve(void 0);
}
},
skipEmptyLines: true,
});
});
});
} catch (e) {
if ((e as any).code == "23505") {
console.log(`dataset ${basename(dir)} already exists`);
return;
}
throw e;
}
}
try {
const glob = new Glob("**/productos.csv");
for await (const file of glob.scan(process.argv[2])) {
const dir = join(process.argv[2], dirname(file));
console.log(dir);
await importDataset(dir);
}
} finally {
await sql.end();
}


@ -0,0 +1,17 @@
{
"name": "sepa-precios-importer",
"module": "index.ts",
"type": "module",
"devDependencies": {
"@types/bun": "^1.1.7",
"@types/papaparse": "^5.3.14"
},
"peerDependencies": {
"typescript": "^5.0.0"
},
"dependencies": {
"p-queue": "^8.0.1",
"papaparse": "^5.4.1",
"postgres": "^3.4.4"
}
}


@ -0,0 +1,27 @@
{
"compilerOptions": {
// Enable latest features
"lib": ["ESNext", "DOM"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,
// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,
// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}

sitio/.env.development (new file, 2 changes)

@ -0,0 +1,2 @@
DB_PATH=../db.db
VITE_API_HOST=http://localhost:8000

sitio/.gitignore (vendored, 2 changes)

@ -4,7 +4,7 @@ node_modules
/.svelte-kit
/package
.env
.env.*
*.local
!.env.example
vite.config.js.timestamp-*
vite.config.ts.timestamp-*


@ -40,6 +40,7 @@
"chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10",
"drizzle-orm": "^0.32.0",
"ky": "^1.5.0",
"zod": "^3.22.4"
},
"packageManager": "pnpm@9.5.0+sha512.140036830124618d624a2187b50d04289d5a087f326c9edfc0ccd733d76c4f52c3a313d4fc148794a2a9d81553016004e6742e8cf850670268a7387fc220c903"


@ -1,5 +1,9 @@
<script lang="ts" context="module">
export type Product = { ean: string; name: string; imageUrl: string | null };
export type Product = {
ean: string;
name: string | null;
image_url: string | null;
};
</script>
<script lang="ts">
@ -7,9 +11,9 @@
</script>
<a href={`/ean/${product.ean}`} class="flex gap-2">
{#if product.imageUrl}
{#if product.image_url}
<img
src={product.imageUrl}
src={product.image_url}
alt={product.name}
class="max-h-48"
loading="lazy"


@ -1 +1,2 @@
// place files you want to import through the `$lib` alias in this folder.
export const API_HOST = import.meta.env.VITE_API_HOST;


@ -1,2 +0,0 @@
export { getDb } from "db-datos/db.js";
export * as schema from "db-datos/schema.js";


@ -1,15 +1,17 @@
import { countDistinct } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios } = schema;
import { z } from "zod";
import ky from "ky";
import { API_HOST } from "$lib";
async function getInfo() {
return z
.object({
count: z.number(),
})
.parse(await ky.get(`${API_HOST}/api/0/info`).json());
}
export const load: PageServerLoad = async () => {
const db = await getDb();
const nProductosR = await db
.select({
count: countDistinct(precios.ean),
})
.from(precios);
const nProductos = nProductosR[0].count;
const nProductos = (await getInfo()).count;
return { nProductos };
};


@ -1,68 +1,29 @@
import type { PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios, bestSelling } = schema;
import { max, sql } from "drizzle-orm";
import z from "zod";
import type { Product } from "$lib/ProductPreview.svelte";
type Data = {
category: string;
products: Product[];
}[];
let cache: Promise<{ key: Date; data: Data }> = doQuery();
async function doQuery() {
const db = await getDb();
const categories = await db
.select({
fetchedAt: bestSelling.fetchedAt,
category: bestSelling.category,
eansJson: bestSelling.eansJson,
})
.from(bestSelling)
.groupBy(bestSelling.category)
.having(max(bestSelling.fetchedAt));
const categoriesWithProducts = await Promise.all(
categories.map(async (category) => {
const eans = z.array(z.string()).parse(JSON.parse(category.eansJson));
const products = await db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios)
.where(sql`${precios.ean} in ${eans}`)
.groupBy(precios.ean)
.having(max(precios.fetchedAt));
return {
category: category.category,
products: eans
.map((ean) => products.find((p) => p.ean === ean))
.filter((x): x is Product => !!x && !!x.name),
};
}),
async function getBestSelling() {
const res = await fetch(
`${import.meta.env.VITE_API_HOST}/api/0/best-selling-products`,
);
return { key: new Date(), data: categoriesWithProducts };
const json = await res.json();
return z
.array(
z.object({
category: z.string(),
products: z.array(
z.object({
ean: z.string(),
name: z.string().nullable(),
image_url: z.string().nullable(),
}),
),
}),
)
.parse(json);
}
console.log("setting up interval");
setInterval(
async () => {
const c = await doQuery();
cache = Promise.resolve(c);
},
4 * 60 * 60 * 1000,
);
export const load: PageServerLoad = async ({
params,
}): Promise<{ data: Data }> => {
return { data: (await cache).data };
export const load: PageServerLoad = async ({ params }) => {
return {
data: await getBestSelling(),
};
};


@ -1,20 +1,23 @@
import { error } from "@sveltejs/kit";
import { eq } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { getDb, schema } from "$lib/server/db";
const { precios } = schema;
import { z } from "zod";
import { zPrecio, type Precio } from "./common";
import { API_HOST } from "$lib";
async function getProductHistory(ean: string) {
const res = await fetch(`${API_HOST}/api/0/ean/${ean}/history`);
const json = await res.json();
return z.array(zPrecio).parse(json);
}
export const load: PageServerLoad = async ({ params }) => {
const db = await getDb();
const q = db
.select()
.from(precios)
.where(eq(precios.ean, params.ean))
.orderBy(precios.fetchedAt);
const res = await q;
const res = await getProductHistory(params.ean);
if (res.length === 0) return error(404, "Not Found");
const meta = res.findLast((p) => p.name);
const meta = res.findLast(
(p): p is Precio & { name: string; image_url: string } =>
!!(p.name && p.image_url),
);
return { precios: res, meta };
};


@ -1,18 +1,18 @@
<script lang="ts">
import { Supermercado, hosts } from "db-datos/supermercado";
import * as schema from "db-datos/schema";
import type { PageData } from "./$types";
import Chart from "./Chart.svelte";
import type { Precio } from "./common";
export let data: PageData;
let urls: Map<Supermercado, schema.Precio>;
let urls: Map<Supermercado, Precio>;
$: urls = data.precios.reduce((prev, curr) => {
const url = new URL(curr.url);
const supermercado = hosts[url.hostname];
prev.set(supermercado, curr);
return prev;
}, new Map<Supermercado, schema.Precio>());
}, new Map<Supermercado, Precio>());
const classBySupermercado: { [supermercado in Supermercado]: string } = {
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
@ -30,18 +30,18 @@
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
<img src={data.meta.image_url} alt={data.meta.name} class="max-h-48" />
<div class="flex gap-2">
{#each urls as [supermercado, { url, precioCentavos }]}
{#each urls as [supermercado, { url, precio_centavos }]}
<a
href={url}
rel="noreferrer noopener"
target="_blank"
class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
>
{#if precioCentavos}
{#if precio_centavos}
<span class="text-lg font-bold"
>{formatter.format(precioCentavos / 100)}</span
>{formatter.format(precio_centavos / 100)}</span
>
{/if}
<span class="text-sm">{supermercado}</span>


@ -1,8 +1,8 @@
<script lang="ts">
import type { Precio } from "db-datos/schema";
// import dayjs from "dayjs";
import ChartJs from "./ChartJs.svelte";
import { hosts, colorBySupermercado } from "db-datos/supermercado";
import type { Precio } from "./common";
export let precios: Precio[];
@ -15,15 +15,15 @@
const ps = precios
.filter((p) => new URL(p.url!).hostname === host)
.filter(
(p): p is Precio & { precioCentavos: number } =>
p.precioCentavos !== null,
(p): p is Precio & { precio_centavos: number } =>
p.precio_centavos !== null,
);
return {
label: supermercado,
data: [
...ps.map((p) => ({
x: p.fetchedAt,
y: p.precioCentavos / 100,
x: p.fetched_at,
y: p.precio_centavos / 100,
})),
// lie
// ...ps.map((p) => ({


@ -0,0 +1,12 @@
import { z } from "zod";
export const zPrecio = z.object({
ean: z.string(),
fetched_at: z.coerce.date(),
precio_centavos: z.number().nullable(),
in_stock: z.boolean().nullable(),
url: z.string(),
name: z.string().nullable(),
image_url: z.string().nullable(),
});
export type Precio = z.infer<typeof zPrecio>;


@ -1,26 +1,29 @@
import { sql } from "drizzle-orm";
import { z } from "zod";
import type { PageServerLoad } from "./$types";
import { getDb } from "$lib/server/db";
import { API_HOST } from "$lib";
import ky from "ky";
const zProductResult = z.object({
ean: z.string(),
name: z.string(),
image_url: z.string(),
});
async function search(query: string) {
return z
.array(zProductResult)
.parse(
await ky
.get(`${API_HOST}/api/0/search/${encodeURIComponent(query)}`)
.json(),
);
}
export const load: PageServerLoad = async ({ url }) => {
const db = await getDb();
const query = url.searchParams.get("q");
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
if (query) {
const sQuery = query
.replaceAll(`"`, `""`)
.split(" ")
.map((s) => `"${s}"`)
.join(" ");
console.debug(sQuery);
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean
where f.name match ${sQuery}
group by p.ean
having max(p.fetched_at)
order by p.in_stock desc;`;
results = db.all(sqlQuery);
}
let results: null | { ean: string; name: string; image_url: string }[] = query
? await search(query)
: null;
return { query, results };
};