mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 22:26:19 +00:00
Compare commits
No commits in common. "91c7087bdc18f01554ed9e8a1076e51fd32569e5" and "6d32c897acc535401ae54571c769009579a66131" have entirely different histories.
91c7087bdc
...
6d32c897ac
50 changed files with 218 additions and 1679 deletions
|
@ -2,8 +2,6 @@ data/warcs/
|
|||
data/carrefour/
|
||||
*/*.db*
|
||||
sqlite.db
|
||||
db.db
|
||||
db.db-wal
|
||||
downloader/
|
||||
node_modules/
|
||||
*/node_modules/
|
||||
|
|
31
.github/workflows/sepa-precios-archiver.yml
vendored
31
.github/workflows/sepa-precios-archiver.yml
vendored
|
@ -1,31 +0,0 @@
|
|||
name: Sepa Precios Archiver
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 */12 * * *" # Run every 12 hours
|
||||
workflow_dispatch: # Allow manual trigger
|
||||
|
||||
jobs:
|
||||
archive-prices:
|
||||
runs-on: ubicloud-standard-4
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
with:
|
||||
bun-version: latest
|
||||
# - name: Setup tmate session
|
||||
# uses: mxschmitt/action-tmate@v3
|
||||
# with:
|
||||
# limit-access-to-actor: true
|
||||
|
||||
- name: Run archiver script
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.ARCHIVE_GITHUB_TOKEN }}
|
||||
B2_BUCKET_NAME: ${{ secrets.B2_BUCKET_NAME }}
|
||||
B2_BUCKET_KEY_ID: ${{ secrets.B2_BUCKET_KEY_ID }}
|
||||
B2_BUCKET_KEY: ${{ secrets.B2_BUCKET_KEY }}
|
||||
run: |
|
||||
cd sepa-precios-archiver
|
||||
bun install --frozen-lockfile
|
||||
bun index.ts
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -3,7 +3,5 @@ node_modules/
|
|||
*.db-shm
|
||||
*.db-wal
|
||||
target/
|
||||
*.local
|
||||
.env.*
|
||||
|
||||
|
||||
.DS_Store
|
|
@ -1 +0,0 @@
|
|||
DB_PATH=../db.db
|
|
@ -1,4 +1,4 @@
|
|||
export const DB_PATH = process.env.DB_PATH ?? "../db.db";
|
||||
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
|
||||
|
||||
/** @type { import("drizzle-kit").Config } */
|
||||
export default {
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
-- Custom SQL migration file, put your code below! --
|
||||
CREATE VIRTUAL TABLE productos_fts USING fts5 (ean, name, content = precios, content_rowid = idd);
|
|
@ -1,208 +0,0 @@
|
|||
{
|
||||
"id": "f981b295-c9eb-4df5-88b1-d3765e4cc314",
|
||||
"prevId": "c95c6547-d540-45cf-aa9d-9d828efb468e",
|
||||
"version": "6",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"db_best_selling": {
|
||||
"name": "db_best_selling",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"category": {
|
||||
"name": "category",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"eans_json": {
|
||||
"name": "eans_json",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
},
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
},
|
||||
"precios_url_idx": {
|
||||
"name": "precios_url_idx",
|
||||
"columns": [
|
||||
"url"
|
||||
],
|
||||
"isUnique": false
|
||||
},
|
||||
"precios_fetched_at_idx": {
|
||||
"name": "precios_fetched_at_idx",
|
||||
"columns": [
|
||||
"fetched_at"
|
||||
],
|
||||
"isUnique": false
|
||||
},
|
||||
"precios_ean_fetched_at_idx": {
|
||||
"name": "precios_ean_fetched_at_idx",
|
||||
"columns": [
|
||||
"ean",
|
||||
"fetched_at"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
},
|
||||
"producto_urls": {
|
||||
"name": "producto_urls",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"first_seen": {
|
||||
"name": "first_seen",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"last_seen": {
|
||||
"name": "last_seen",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"producto_urls_url_unique": {
|
||||
"name": "producto_urls_url_unique",
|
||||
"columns": [
|
||||
"url"
|
||||
],
|
||||
"isUnique": true
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
},
|
||||
"internal": {
|
||||
"indexes": {}
|
||||
}
|
||||
}
|
|
@ -39,12 +39,12 @@ importers:
|
|||
dayjs:
|
||||
specifier: ^1.11.10
|
||||
version: 1.11.10
|
||||
drizzle-kit:
|
||||
specifier: ^0.23.0
|
||||
version: 0.23.0
|
||||
drizzle-orm:
|
||||
specifier: ^0.32.0
|
||||
version: 0.32.0(@types/better-sqlite3@7.6.9)(better-sqlite3@11.1.2)
|
||||
ky:
|
||||
specifier: ^1.5.0
|
||||
version: 1.5.0
|
||||
zod:
|
||||
specifier: ^3.22.4
|
||||
version: 3.22.4
|
||||
|
@ -1227,10 +1227,6 @@ packages:
|
|||
resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
ky@1.5.0:
|
||||
resolution: {integrity: sha512-bkQo+UqryW6Zmo/DsixYZE4Z9t2mzvNMhceyIhuMuInb3knm5Q+GNGMKveydJAj+Z6piN1SwI6eR/V0G+Z0BtA==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
lilconfig@2.1.0:
|
||||
resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
@ -2682,8 +2678,6 @@ snapshots:
|
|||
|
||||
kleur@4.1.5: {}
|
||||
|
||||
ky@1.5.0: {}
|
||||
|
||||
lilconfig@2.1.0: {}
|
||||
|
||||
lilconfig@3.1.1: {}
|
||||
|
|
|
@ -1 +1 @@
|
|||
DATABASE_URL=sqlite://../db.db
|
||||
DATABASE_URL=sqlite://../sqlite.db
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
{
|
||||
"db_name": "SQLite",
|
||||
"query": "select count(distinct ean) as count from precios",
|
||||
"describe": {
|
||||
"columns": [
|
||||
{
|
||||
"name": "count",
|
||||
"ordinal": 0,
|
||||
"type_info": "Integer"
|
||||
}
|
||||
],
|
||||
"parameters": {
|
||||
"Right": 0
|
||||
},
|
||||
"nullable": [
|
||||
false
|
||||
]
|
||||
},
|
||||
"hash": "2e632fbda989abf0d8a88a1d3bc1de0a9aefb0d3f3cdc33d26158d09faed97b2"
|
||||
}
|
|
@ -1,38 +0,0 @@
|
|||
{
|
||||
"db_name": "SQLite",
|
||||
"query": "with search_results as (\n select f.ean from precios_fts f\n where f.name match ? and f.ean != ''\n group by f.ean\n\t\t\tlimit 100\n )\n select p.id, p.ean, p.name, p.image_url from search_results as s\n join precios as p\n on p.ean = s.ean\n where p.fetched_at = (\n SELECT MAX(fetched_at)\n FROM precios as pf\n WHERE pf.ean = s.ean and pf.name is not null\n );",
|
||||
"describe": {
|
||||
"columns": [
|
||||
{
|
||||
"name": "id",
|
||||
"ordinal": 0,
|
||||
"type_info": "Integer"
|
||||
},
|
||||
{
|
||||
"name": "ean",
|
||||
"ordinal": 1,
|
||||
"type_info": "Text"
|
||||
},
|
||||
{
|
||||
"name": "name",
|
||||
"ordinal": 2,
|
||||
"type_info": "Text"
|
||||
},
|
||||
{
|
||||
"name": "image_url",
|
||||
"ordinal": 3,
|
||||
"type_info": "Text"
|
||||
}
|
||||
],
|
||||
"parameters": {
|
||||
"Right": 1
|
||||
},
|
||||
"nullable": [
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true
|
||||
]
|
||||
},
|
||||
"hash": "3ee249afda554bbffe736257af05aba689c71188ce1a869e01988ac7ca1220a2"
|
||||
}
|
|
@ -6,7 +6,7 @@
|
|||
{
|
||||
"name": "count",
|
||||
"ordinal": 0,
|
||||
"type_info": "Integer"
|
||||
"type_info": "Int"
|
||||
}
|
||||
],
|
||||
"parameters": {
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
{
|
||||
"db_name": "SQLite",
|
||||
"query": "\nselect ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios\nwhere ean = ?\norder by fetched_at\n",
|
||||
"describe": {
|
||||
"columns": [
|
||||
{
|
||||
"name": "ean",
|
||||
"ordinal": 0,
|
||||
"type_info": "Text"
|
||||
},
|
||||
{
|
||||
"name": "fetched_at",
|
||||
"ordinal": 1,
|
||||
"type_info": "Integer"
|
||||
},
|
||||
{
|
||||
"name": "precio_centavos",
|
||||
"ordinal": 2,
|
||||
"type_info": "Integer"
|
||||
},
|
||||
{
|
||||
"name": "in_stock",
|
||||
"ordinal": 3,
|
||||
"type_info": "Integer"
|
||||
},
|
||||
{
|
||||
"name": "url",
|
||||
"ordinal": 4,
|
||||
"type_info": "Text"
|
||||
},
|
||||
{
|
||||
"name": "name",
|
||||
"ordinal": 5,
|
||||
"type_info": "Text"
|
||||
},
|
||||
{
|
||||
"name": "image_url",
|
||||
"ordinal": 6,
|
||||
"type_info": "Text"
|
||||
}
|
||||
],
|
||||
"parameters": {
|
||||
"Right": 1
|
||||
},
|
||||
"nullable": [
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
true,
|
||||
true
|
||||
]
|
||||
},
|
||||
"hash": "88a597e29390fb04bbc48d9f88303551e068ddc478b037354c62bc77bc70ad96"
|
||||
}
|
|
@ -6,7 +6,7 @@
|
|||
{
|
||||
"name": "count",
|
||||
"ordinal": 0,
|
||||
"type_info": "Integer"
|
||||
"type_info": "Int"
|
||||
}
|
||||
],
|
||||
"parameters": {
|
||||
|
|
135
rust/Cargo.lock
generated
135
rust/Cargo.lock
generated
|
@ -35,6 +35,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"getrandom 0.2.15",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
|
@ -162,7 +163,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -353,7 +354,6 @@ dependencies = [
|
|||
"iana-time-zone",
|
||||
"js-sys",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
@ -386,10 +386,10 @@ version = "4.5.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -404,15 +404,6 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-oid"
|
||||
version = "0.9.6"
|
||||
|
@ -561,14 +552,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.3.1"
|
||||
version = "2.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"parking",
|
||||
"pin-project-lite",
|
||||
]
|
||||
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
|
@ -679,7 +665,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -762,13 +748,22 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.9.1"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
||||
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||
dependencies = [
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
|
@ -1038,9 +1033,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
|||
|
||||
[[package]]
|
||||
name = "libsqlite3-sys"
|
||||
version = "0.28.0"
|
||||
version = "0.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f"
|
||||
checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
|
@ -1239,12 +1234,6 @@ version = "0.1.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
|
@ -1331,7 +1320,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1805,7 +1794,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1911,9 +1900,6 @@ name = "smallvec"
|
|||
version = "1.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
|
@ -1956,9 +1942,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "27144619c6e5802f1380337a209d2ac1c431002dd74c6e60aebff3c506dc4f0c"
|
||||
checksum = "c9a2ccff1a000a5a59cd33da541d9f2fdcd9e6e8229cc200565942bff36d0aaa"
|
||||
dependencies = [
|
||||
"sqlx-core",
|
||||
"sqlx-macros",
|
||||
|
@ -1969,10 +1955,11 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx-core"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a999083c1af5b5d6c071d34a708a19ba3e02106ad82ef7bbd69f5e48266b613b"
|
||||
checksum = "24ba59a9342a3d9bab6c56c118be528b27c9b60e490080e9711a04dccac83ef6"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"atoi",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
|
@ -1986,7 +1973,6 @@ dependencies = [
|
|||
"futures-intrusive",
|
||||
"futures-io",
|
||||
"futures-util",
|
||||
"hashbrown",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"indexmap",
|
||||
|
@ -2009,26 +1995,26 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx-macros"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a23217eb7d86c584b8cbe0337b9eacf12ab76fe7673c513141ec42565698bb88"
|
||||
checksum = "4ea40e2345eb2faa9e1e5e326db8c34711317d2b5e08d0d5741619048a803127"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"sqlx-core",
|
||||
"sqlx-macros-core",
|
||||
"syn",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlx-macros-core"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a099220ae541c5db479c6424bdf1b200987934033c2584f79a0e1693601e776"
|
||||
checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8"
|
||||
dependencies = [
|
||||
"dotenvy",
|
||||
"either",
|
||||
"heck",
|
||||
"heck 0.4.1",
|
||||
"hex",
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
|
@ -2040,7 +2026,7 @@ dependencies = [
|
|||
"sqlx-mysql",
|
||||
"sqlx-postgres",
|
||||
"sqlx-sqlite",
|
||||
"syn",
|
||||
"syn 1.0.109",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"url",
|
||||
|
@ -2048,12 +2034,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx-mysql"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5afe4c38a9b417b6a9a5eeffe7235d0a106716495536e7727d1c7f4b1ff3eba6"
|
||||
checksum = "1ed31390216d20e538e447a7a9b959e06ed9fc51c37b514b46eb758016ecd418"
|
||||
dependencies = [
|
||||
"atoi",
|
||||
"base64 0.22.1",
|
||||
"base64 0.21.7",
|
||||
"bitflags 2.6.0",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
|
@ -2091,12 +2077,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx-postgres"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1dbb157e65f10dbe01f729339c06d239120221c9ad9fa0ba8408c4cc18ecf21"
|
||||
checksum = "7c824eb80b894f926f89a0b9da0c7f435d27cdd35b8c655b114e58223918577e"
|
||||
dependencies = [
|
||||
"atoi",
|
||||
"base64 0.22.1",
|
||||
"base64 0.21.7",
|
||||
"bitflags 2.6.0",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
|
@ -2130,9 +2116,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sqlx-sqlite"
|
||||
version = "0.8.0"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b2cdd83c008a622d94499c0006d8ee5f821f36c89b7d625c900e5dc30b5c5ee"
|
||||
checksum = "b244ef0a8414da0bed4bb1910426e890b19e5e9bccc27ada6b797d05c55ae0aa"
|
||||
dependencies = [
|
||||
"atoi",
|
||||
"chrono",
|
||||
|
@ -2146,10 +2132,10 @@ dependencies = [
|
|||
"log",
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
"serde_urlencoded",
|
||||
"sqlx-core",
|
||||
"tracing",
|
||||
"url",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2175,6 +2161,17 @@ version = "2.6.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.109"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.71"
|
||||
|
@ -2227,7 +2224,7 @@ checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2288,7 +2285,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2386,7 +2383,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2467,6 +2464,12 @@ version = "0.1.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
||||
|
||||
[[package]]
|
||||
name = "unicode_categories"
|
||||
version = "0.1.1"
|
||||
|
@ -2490,6 +2493,12 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urlencoding"
|
||||
version = "2.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
|
@ -2568,7 +2577,7 @@ dependencies = [
|
|||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
|
@ -2602,7 +2611,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
@ -2854,7 +2863,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
|||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn 2.0.71",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -9,10 +9,10 @@ edition = "2021"
|
|||
again = "0.1.2"
|
||||
anyhow = "1.0.79"
|
||||
base64 = "0.21.7"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
chrono = "0.4"
|
||||
clap = { version = "4.4.15", features = ["derive"] }
|
||||
cron = "0.12.0"
|
||||
sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite", "chrono", "json" ] }
|
||||
sqlx = { version = "0.7", features = [ "runtime-tokio", "sqlite", "chrono" ] }
|
||||
futures = "0.3.30"
|
||||
html-escape = "0.2.13"
|
||||
itertools = "0.12.0"
|
||||
|
|
|
@ -1,16 +1,8 @@
|
|||
use axum::{
|
||||
extract::{Path, State},
|
||||
http::StatusCode,
|
||||
response::IntoResponse,
|
||||
routing::get,
|
||||
Json, Router,
|
||||
};
|
||||
use chrono::{DateTime, Utc};
|
||||
use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
|
||||
use clap::ValueEnum;
|
||||
use futures::future::join_all;
|
||||
use itertools::Itertools;
|
||||
use preciazo::supermercado::Supermercado;
|
||||
use serde::Serialize;
|
||||
use sqlx::{
|
||||
sqlite::{SqliteConnectOptions, SqlitePoolOptions},
|
||||
SqlitePool,
|
||||
|
@ -102,220 +94,31 @@ async fn healthcheck(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct CategoryWithProducts {
|
||||
category: String,
|
||||
products: Vec<Product>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Product {
|
||||
ean: String,
|
||||
name: Option<String>,
|
||||
image_url: Option<String>,
|
||||
}
|
||||
|
||||
async fn get_best_selling(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||
#[derive(sqlx::FromRow, Debug)]
|
||||
struct ProductWithCategory {
|
||||
category: String,
|
||||
ean: String,
|
||||
name: Option<String>,
|
||||
image_url: Option<String>,
|
||||
}
|
||||
|
||||
let products_with_category = sqlx::query_as::<_, ProductWithCategory>(
|
||||
"with latest_best_selling as (
|
||||
select category, eans_json
|
||||
from db_best_selling
|
||||
group by category
|
||||
having max(fetched_at)
|
||||
),
|
||||
extracted_eans as (
|
||||
select latest_best_selling.category, json.value as ean
|
||||
from latest_best_selling, json_each(latest_best_selling.eans_json) json
|
||||
)
|
||||
select extracted_eans.category, extracted_eans.ean, precios.image_url, name
|
||||
from extracted_eans
|
||||
join precios
|
||||
on extracted_eans.ean = precios.ean
|
||||
where
|
||||
precios.fetched_at = (
|
||||
SELECT MAX(fetched_at)
|
||||
FROM precios
|
||||
WHERE ean = extracted_eans.ean
|
||||
)",
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let categories = products_with_category
|
||||
.iter()
|
||||
.map(|p| p.category.clone())
|
||||
.unique()
|
||||
.collect_vec();
|
||||
|
||||
let categories_with_products = categories
|
||||
.into_iter()
|
||||
.map(|c| CategoryWithProducts {
|
||||
category: c.clone(),
|
||||
products: products_with_category
|
||||
.iter()
|
||||
.filter(|p| p.category == c)
|
||||
.map(|p| Product {
|
||||
ean: p.ean.clone(),
|
||||
image_url: p.image_url.clone(),
|
||||
name: p.name.clone(),
|
||||
})
|
||||
.collect_vec(),
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
Json(categories_with_products)
|
||||
}
|
||||
|
||||
async fn get_product_history(
|
||||
State(pool): State<SqlitePool>,
|
||||
Path(ean): Path<String>,
|
||||
) -> impl IntoResponse {
|
||||
#[derive(sqlx::FromRow, Debug, Serialize)]
|
||||
struct Precio {
|
||||
ean: String,
|
||||
fetched_at: chrono::DateTime<Utc>,
|
||||
precio_centavos: Option<i64>,
|
||||
in_stock: Option<bool>,
|
||||
url: String,
|
||||
name: Option<String>,
|
||||
image_url: Option<String>,
|
||||
}
|
||||
|
||||
let precios = sqlx::query!(
|
||||
"
|
||||
select ean,fetched_at,precio_centavos,in_stock,url,name,image_url from precios
|
||||
where ean = ?
|
||||
order by fetched_at
|
||||
",
|
||||
ean
|
||||
)
|
||||
.map(|r| Precio {
|
||||
ean: r.ean,
|
||||
url: r.url,
|
||||
fetched_at: DateTime::from_timestamp(r.fetched_at, 0).unwrap(),
|
||||
image_url: r.image_url,
|
||||
name: r.name,
|
||||
in_stock: r.in_stock.map(|x| x == 1),
|
||||
precio_centavos: r.precio_centavos,
|
||||
})
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
Json(precios)
|
||||
}
|
||||
async fn search(State(pool): State<SqlitePool>, Path(query): Path<String>) -> impl IntoResponse {
|
||||
let sql_query = query
|
||||
.clone()
|
||||
.replace("\"", "\"\"")
|
||||
.split(" ")
|
||||
.map(|x| format!("\"{}\"", x))
|
||||
.join(" ");
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Result {
|
||||
ean: String,
|
||||
name: String,
|
||||
image_url: String,
|
||||
}
|
||||
|
||||
let results = sqlx::query!(
|
||||
"with search_results as (
|
||||
select f.ean from precios_fts f
|
||||
where f.name match ? and f.ean != ''
|
||||
group by f.ean
|
||||
limit 100
|
||||
)
|
||||
select p.id, p.ean, p.name, p.image_url from search_results as s
|
||||
join precios as p
|
||||
on p.ean = s.ean
|
||||
where p.fetched_at = (
|
||||
SELECT MAX(fetched_at)
|
||||
FROM precios as pf
|
||||
WHERE pf.ean = s.ean and pf.name is not null
|
||||
);",
|
||||
sql_query
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|r| Result {
|
||||
ean: r.ean,
|
||||
image_url: r.image_url.unwrap(),
|
||||
name: r.name.unwrap(),
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
Json(results)
|
||||
}
|
||||
|
||||
async fn get_info(State(pool): State<SqlitePool>) -> impl IntoResponse {
|
||||
#[derive(Serialize)]
|
||||
struct Info {
|
||||
count: i64,
|
||||
}
|
||||
|
||||
let count = sqlx::query!("select count(distinct ean) as count from precios")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap()
|
||||
.count;
|
||||
Json(Info { count })
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let pool = SqlitePoolOptions::new()
|
||||
.max_connections(10)
|
||||
.max_connections(1)
|
||||
.connect_with(
|
||||
SqliteConnectOptions::from_str(&format!(
|
||||
"sqlite://{}",
|
||||
env::var("DB_PATH").unwrap_or("../db.db".to_string())
|
||||
env::var("DB_PATH").unwrap_or("../sqlite.db".to_string())
|
||||
))
|
||||
.unwrap()
|
||||
.journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)
|
||||
.synchronous(sqlx::sqlite::SqliteSynchronous::Normal)
|
||||
.busy_timeout(Duration::from_secs(30))
|
||||
.busy_timeout(Duration::from_secs(15))
|
||||
.optimize_on_close(true, None),
|
||||
)
|
||||
.await
|
||||
.expect("can't connect to database");
|
||||
|
||||
sqlx::query("pragma temp_store = memory;")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("pragma mmap_size = 30000000000;")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("pragma page_size = 4096;")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let app = Router::new()
|
||||
.route("/", get(index))
|
||||
.route("/api/healthcheck", get(healthcheck))
|
||||
.route("/api/0/best-selling-products", get(get_best_selling))
|
||||
.route("/api/0/ean/:ean/history", get(get_product_history))
|
||||
.route("/api/0/info", get(get_info))
|
||||
.route("/api/0/search/:query", get(search))
|
||||
.with_state(pool);
|
||||
|
||||
let listener = tokio::net::TcpListener::bind("0.0.0.0:8000").await.unwrap();
|
||||
tracing::info!("listening on {}", listener.local_addr().unwrap());
|
||||
tracing::debug!("listening on {}", listener.local_addr().unwrap());
|
||||
axum::serve(listener, app).await.unwrap();
|
||||
}
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
use std::env;
|
||||
|
||||
use super::now_sec;
|
||||
use super::AutoArgs;
|
||||
use super::AutoTelegram;
|
||||
|
@ -66,16 +64,7 @@ impl Auto {
|
|||
// }
|
||||
{
|
||||
let t0 = now_sec();
|
||||
|
||||
let n_coroutines = if supermercado == Supermercado::Coto {
|
||||
50
|
||||
} else {
|
||||
env::var("N_COROUTINES")
|
||||
.map_or(Ok(24), |s| s.parse::<usize>())
|
||||
.expect("N_COROUTINES no es un número")
|
||||
};
|
||||
|
||||
let counters = self.scraper.fetch_list(&self.db, links, n_coroutines).await;
|
||||
let counters = self.scraper.fetch_list(&self.db, links).await;
|
||||
self.inform(&format!(
|
||||
"Downloaded {:?}: {:?} (took {})",
|
||||
&supermercado,
|
||||
|
|
|
@ -17,7 +17,7 @@ pub struct Db {
|
|||
|
||||
impl Db {
|
||||
pub async fn connect() -> anyhow::Result<Self> {
|
||||
let db_path = env::var("DB_PATH").unwrap_or("../db.db".to_string());
|
||||
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
||||
info!("Opening DB at {}", db_path);
|
||||
let read_pool = connect_to_db(&db_path, 32).await?;
|
||||
let write_pool = connect_to_db(&db_path, 1).await?;
|
||||
|
|
|
@ -99,7 +99,7 @@ async fn fetch_list_cli(links_list_path: String) -> anyhow::Result<()> {
|
|||
|
||||
let db = Db::connect().await?;
|
||||
let scraper = Scraper::from_env().await?;
|
||||
let counters = scraper.fetch_list(&db, links, 100).await;
|
||||
let counters = scraper.fetch_list(&db, links).await;
|
||||
|
||||
println!("Finished: {:?}", counters);
|
||||
Ok(())
|
||||
|
|
|
@ -128,7 +128,11 @@ impl Scraper {
|
|||
counters
|
||||
}
|
||||
|
||||
pub async fn fetch_list(&self, db: &Db, links: Vec<String>, n_coroutines: usize) -> Counters {
|
||||
pub async fn fetch_list(&self, db: &Db, links: Vec<String>) -> Counters {
|
||||
let n_coroutines = env::var("N_COROUTINES")
|
||||
.map_or(Ok(24), |s| s.parse::<usize>())
|
||||
.expect("N_COROUTINES no es un número");
|
||||
|
||||
stream::iter(links)
|
||||
.map(|url| {
|
||||
let db = db.clone();
|
||||
|
|
|
@ -56,11 +56,7 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
|
|||
.find_map(|n| n.as_tag())
|
||||
.map(|t| t.inner_text(dom.parser()))
|
||||
// https://github.com/catdevnull/preciazo/issues/24
|
||||
.map(|s| {
|
||||
html_escape::decode_html_entities(s.trim())
|
||||
.trim()
|
||||
.to_string()
|
||||
});
|
||||
.map(|s| html_escape::decode_html_entities(s.trim()).to_string());
|
||||
|
||||
let image_url = dom
|
||||
.query_selector(".zoomImage1")
|
||||
|
|
|
@ -207,15 +207,14 @@ pub async fn get_best_selling_by_category(
|
|||
.append_pair("extensions", &{
|
||||
let variables_obj = json!({"hideUnavailableItems":true,"skusFilter":"FIRST_AVAILABLE","simulationBehavior":"default","installmentCriteria":"MAX_WITHOUT_INTEREST","productOriginVtex":false,"map":"c","query":query,"orderBy":"OrderByTopSaleDESC","from":0,"to":99,"selectedFacets":
|
||||
query.split('/').map(|f| json!({"key":"c","value":f})).collect::<Vec<_>>()
|
||||
,"facetsBehavior":"Static","categoryTreeBehavior":"default",
|
||||
"withFacets":false,"showSponsored":false,"advertisementOptions":{"showSponsored":false,"sponsoredCount":0,"advertisementPlacement":"top_search","repeatSponsoredProducts":true}});
|
||||
,"facetsBehavior":"Static","categoryTreeBehavior":"default","withFacets":false,"showSponsored":false});
|
||||
let b64=base64::prelude::BASE64_STANDARD.encode(variables_obj.to_string());
|
||||
|
||||
format!(
|
||||
r#"{{
|
||||
"persistedQuery": {{
|
||||
"version": 1,
|
||||
"sha256Hash": "8e3fd5f65d7d83516bfea23051b11e7aa469d85f26906f27e18afbee52c56ce4",
|
||||
"sha256Hash": "fd92698fe375e8e4fa55d26fa62951d979b790fcf1032a6f02926081d199f550",
|
||||
"sender": "vtex.store-resources@0.x",
|
||||
"provider": "vtex.search-graphql@0.x"
|
||||
}},
|
||||
|
|
175
sepa-precios-archiver/.gitignore
vendored
175
sepa-precios-archiver/.gitignore
vendored
|
@ -1,175 +0,0 @@
|
|||
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
|
||||
|
||||
# Logs
|
||||
|
||||
logs
|
||||
_.log
|
||||
npm-debug.log_
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
lerna-debug.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# Caches
|
||||
|
||||
.cache
|
||||
|
||||
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||
|
||||
report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
|
||||
|
||||
# Runtime data
|
||||
|
||||
pids
|
||||
_.pid
|
||||
_.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
|
||||
coverage
|
||||
*.lcov
|
||||
|
||||
# nyc test coverage
|
||||
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Snowpack dependency directory (https://snowpack.dev/)
|
||||
|
||||
web_modules/
|
||||
|
||||
# TypeScript cache
|
||||
|
||||
*.tsbuildinfo
|
||||
|
||||
# Optional npm cache directory
|
||||
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
|
||||
.eslintcache
|
||||
|
||||
# Optional stylelint cache
|
||||
|
||||
.stylelintcache
|
||||
|
||||
# Microbundle cache
|
||||
|
||||
.rpt2_cache/
|
||||
.rts2_cache_cjs/
|
||||
.rts2_cache_es/
|
||||
.rts2_cache_umd/
|
||||
|
||||
# Optional REPL history
|
||||
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variable files
|
||||
|
||||
.env
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.local
|
||||
|
||||
# parcel-bundler cache (https://parceljs.org/)
|
||||
|
||||
.parcel-cache
|
||||
|
||||
# Next.js build output
|
||||
|
||||
.next
|
||||
out
|
||||
|
||||
# Nuxt.js build / generate output
|
||||
|
||||
.nuxt
|
||||
dist
|
||||
|
||||
# Gatsby files
|
||||
|
||||
# Comment in the public line in if your project uses Gatsby and not Next.js
|
||||
|
||||
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||
|
||||
# public
|
||||
|
||||
# vuepress build output
|
||||
|
||||
.vuepress/dist
|
||||
|
||||
# vuepress v2.x temp and cache directory
|
||||
|
||||
.temp
|
||||
|
||||
# Docusaurus cache and generated files
|
||||
|
||||
.docusaurus
|
||||
|
||||
# Serverless directories
|
||||
|
||||
.serverless/
|
||||
|
||||
# FuseBox cache
|
||||
|
||||
.fusebox/
|
||||
|
||||
# DynamoDB Local files
|
||||
|
||||
.dynamodb/
|
||||
|
||||
# TernJS port file
|
||||
|
||||
.tern-port
|
||||
|
||||
# Stores VSCode versions used for testing VSCode extensions
|
||||
|
||||
.vscode-test
|
||||
|
||||
# yarn v2
|
||||
|
||||
.yarn/cache
|
||||
.yarn/unplugged
|
||||
.yarn/build-state.yml
|
||||
.yarn/install-state.gz
|
||||
.pnp.*
|
||||
|
||||
# IntelliJ based IDEs
|
||||
.idea
|
||||
|
||||
# Finder (MacOS) folder config
|
||||
.DS_Store
|
|
@ -1,19 +0,0 @@
|
|||
# sepa-precios-archiver
|
||||
|
||||
Archivador del dataset de precios de [Precios Claros - Base SEPA](https://datos.produccion.gob.ar/dataset/sepa-precios). Recomprime para utilizar ~8 veces menos espacio, y resube a un bucket mío de Backblaze B2.
|
||||
|
||||
## Instalación
|
||||
|
||||
Para instalar las dependencias:
|
||||
|
||||
```bash
|
||||
bun install
|
||||
```
|
||||
|
||||
Para ejecutarlo:
|
||||
|
||||
```bash
|
||||
bun run index.ts
|
||||
```
|
||||
|
||||
This project was created using `bun init` in bun v1.1.25. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
|
Binary file not shown.
|
@ -1,190 +0,0 @@
|
|||
import { z } from "zod";
|
||||
import { zDatasetInfo } from "./schemas";
|
||||
import { mkdtemp, writeFile, readdir, mkdir, rm } from "fs/promises";
|
||||
import { basename, extname, join } from "path";
|
||||
import { $, write } from "bun";
|
||||
import { S3Client, HeadObjectCommand } from "@aws-sdk/client-s3";
|
||||
import { Upload } from "@aws-sdk/lib-storage";
|
||||
|
||||
/**
 * Reads a required environment variable and logs whether it is present.
 *
 * @param variableName - name of the variable to look up in `process.env`
 * @returns the variable's value when it is set and non-empty
 *
 * Terminates the process with exit code 1 when the variable is unset or empty.
 */
function checkEnvVariable(variableName: string) {
  const value = process.env[variableName];
  if (!value) {
    console.log(`❌ ${variableName} is not set`);
    process.exit(1);
  }
  console.log(`✅ ${variableName} is set`);
  return value;
}
|
||||
|
||||
const GITHUB_TOKEN = checkEnvVariable("GITHUB_TOKEN");
|
||||
const B2_BUCKET_NAME = checkEnvVariable("B2_BUCKET_NAME");
|
||||
const B2_BUCKET_KEY_ID = checkEnvVariable("B2_BUCKET_KEY_ID");
|
||||
const B2_BUCKET_KEY = checkEnvVariable("B2_BUCKET_KEY");
|
||||
|
||||
const s3 = new S3Client({
|
||||
endpoint: "https://s3.us-west-004.backblazeb2.com",
|
||||
region: "us-west-004",
|
||||
credentials: {
|
||||
accessKeyId: B2_BUCKET_KEY_ID,
|
||||
secretAccessKey: B2_BUCKET_KEY,
|
||||
},
|
||||
});
|
||||
|
||||
/**
 * Fetches the CKAN `package_show` payload for the `sepa-precios` dataset
 * (through `fetchWithRetry`) and returns the decoded JSON body unvalidated.
 */
async function getRawDatasetInfo() {
  const packageShowUrl =
    "https://datos.produccion.gob.ar/api/3/action/package_show?id=sepa-precios";
  const res = await fetchWithRetry(packageShowUrl);
  return await res.json();
}
|
||||
|
||||
/**
 * Commits the given dataset metadata as `dataset-info.json` to the
 * `sepa-precios-metadata` GitHub repository.
 *
 * Clones the repo into a fresh temporary directory, overwrites the file,
 * commits only if the staged content actually changed, pushes to `main`,
 * and always removes the temporary clone afterwards.
 *
 * @param datasetInfo - JSON-serialisable metadata to store
 */
async function saveDatasetInfoIntoRepo(datasetInfo: any) {
  const repoDir = await mkdtemp("/tmp/sepa-precios-archiver-metadata-repo-");
  try {
    await $`git clone https://catdevnull:${GITHUB_TOKEN}@github.com/catdevnull/sepa-precios-metadata.git ${repoDir}`;

    const serialized = JSON.stringify(datasetInfo, null, 2);
    await writeFile(join(repoDir, "dataset-info.json"), serialized);

    // Each `$` call starts in the original working directory, hence the repeated `cd`.
    await $`cd ${repoDir} && git add dataset-info.json`;
    await $`cd ${repoDir} && git config user.email "git@nulo.in" && git config user.name "github actions"`;
    // `git diff --quiet` exits non-zero only when something is staged, so the
    // commit is skipped when the metadata is unchanged.
    await $`cd ${repoDir} && git diff --staged --quiet || git commit -m "Update dataset info"`;
    await $`cd ${repoDir} && git push origin main`;
  } finally {
    await $`rm -rf ${repoDir}`;
  }
  console.log(`✅ Saved dataset info into repo`);
}
|
||||
|
||||
/**
 * Checks whether an object with the given key already exists in the B2 bucket.
 *
 * @param fileName - object key to probe with a HEAD request
 * @returns `true` when the HEAD request succeeds, `false` when it fails with
 *          an error whose `name` is `"NotFound"`
 * @throws any other error raised by the S3 client, unchanged
 */
async function checkFileExistsInB2(fileName: string): Promise<boolean> {
  const headRequest = new HeadObjectCommand({
    Bucket: B2_BUCKET_NAME,
    Key: fileName,
  });
  try {
    await s3.send(headRequest);
  } catch (error) {
    if ((error as any).name !== "NotFound") throw error;
    return false;
  }
  return true;
}
|
||||
|
||||
/**
 * Uploads `fileContent` to the B2 bucket under the key `fileName` using the
 * `Upload` helper from `@aws-sdk/lib-storage`, and waits for it to finish.
 *
 * @param fileName - destination object key
 * @param fileContent - payload to store
 */
async function uploadToB2Bucket(
  fileName: string,
  fileContent: ReadableStream | Blob | string,
) {
  const params = {
    Bucket: B2_BUCKET_NAME,
    Key: fileName,
    Body: fileContent,
  };
  await new Upload({ client: s3, params }).done();
}
|
||||
|
||||
const rawDatasetInfo = await getRawDatasetInfo();
|
||||
|
||||
await saveDatasetInfoIntoRepo(rawDatasetInfo);
|
||||
|
||||
let errored = false;
|
||||
|
||||
/**
 * `fetch` wrapper that retries failed requests.
 *
 * A request counts as failed when `fetch` rejects (including the per-attempt
 * timeout) or when the response status is not OK. Between attempts it waits
 * `1000 ms × attempt number`.
 *
 * @param url - resource to request
 * @param maxRetries - total number of attempts before giving up
 * @param waitTime - per-attempt timeout in milliseconds
 * @returns the first OK response
 * @throws the error of the last attempt, or `"Max retries reached"` when
 *         `maxRetries` allows no attempt at all
 */
async function fetchWithRetry(
  url: string,
  maxRetries = 3,
  waitTime = 15000,
): Promise<Response> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const response = await fetch(url, {
        signal: AbortSignal.timeout(waitTime),
      });
      if (response.ok) return response;
      throw new Error(`HTTP error! status: ${response.status}`);
    } catch (error) {
      console.error(`Attempt ${attempt} failed: ${error}`);
      if (attempt >= maxRetries) throw error;
      // Linear back-off: 1s after the first failure, 2s after the second, ...
      await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
    }
  }
  throw new Error("Max retries reached");
}
|
||||
|
||||
/**
 * Type guard that validates a download response.
 *
 * For a non-OK response it logs the failing URL, sets the module-level
 * `errored` flag and returns `false` so the caller can skip the resource.
 *
 * @param res - response to validate
 * @returns `true` when the response is OK and has a body
 * @throws Error when the response is OK but carries no body
 */
function checkRes(
  res: Response,
): res is Response & { body: ReadableStream<Uint8Array> } {
  if (res.ok) {
    if (!res.body) throw new Error(`❌ No body in response`);
    return true;
  }
  console.error(`❌ Error downloading ${res.url}`);
  errored = true;
  return false;
}
|
||||
|
||||
await uploadToB2Bucket(
|
||||
`timestamped-metadata/${new Date().toISOString()}.json`,
|
||||
JSON.stringify(rawDatasetInfo, null, 2),
|
||||
);
|
||||
|
||||
const datasetInfo = z.object({ result: zDatasetInfo }).parse(rawDatasetInfo);
|
||||
for (const resource of datasetInfo.result.resources) {
|
||||
if (extname(resource.url) === ".zip") {
|
||||
const fileName = `${resource.id}-${basename(resource.url)}-repackaged.tar.zst`;
|
||||
if (await checkFileExistsInB2(fileName)) continue;
|
||||
console.log(`⬇️ Downloading, repackaging and uploading ${resource.url}`);
|
||||
const dir = await mkdtemp("/tmp/sepa-precios-archiver-repackage-");
|
||||
console.info(dir);
|
||||
try {
|
||||
const zip = join(dir, "zip");
|
||||
await $`curl --retry 8 --retry-delay 5 --retry-all-errors -L -o ${zip} ${resource.url}`;
|
||||
await $`unzip ${zip} -d ${dir}`;
|
||||
await rm(zip);
|
||||
|
||||
for (const file of await readdir(dir)) {
|
||||
const path = join(dir, file);
|
||||
if (extname(file) !== ".zip") continue;
|
||||
const extractDir = join(dir, basename(file, ".zip"));
|
||||
await mkdir(extractDir, { recursive: true });
|
||||
await $`cd ${dir} && unzip ${path} -d ${extractDir}`;
|
||||
await rm(path);
|
||||
}
|
||||
|
||||
await writeFile(
|
||||
join(dir, "dataset-info.json"),
|
||||
JSON.stringify(rawDatasetInfo, null, 2),
|
||||
);
|
||||
|
||||
const compressed =
|
||||
await $`tar -c -C ${dir} . | zstd -15 --long -T0`.blob();
|
||||
await uploadToB2Bucket(fileName, compressed);
|
||||
} finally {
|
||||
await $`rm -rf ${dir}`;
|
||||
}
|
||||
} else {
|
||||
const fileName = `${resource.id}-${basename(resource.url)}`;
|
||||
if (await checkFileExistsInB2(fileName)) continue;
|
||||
console.log(`⬇️ Downloading and reuploading ${resource.url}`);
|
||||
const response = await fetchWithRetry(resource.url, 3, 60 * 1000);
|
||||
if (!checkRes(response)) continue;
|
||||
|
||||
await uploadToB2Bucket(fileName, response.body);
|
||||
}
|
||||
}
|
||||
|
||||
if (errored) {
|
||||
process.exit(1);
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
{
|
||||
"name": "sepa-precios-archiver",
|
||||
"module": "index.ts",
|
||||
"type": "module",
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.637.0",
|
||||
"@aws-sdk/lib-storage": "^3.637.0",
|
||||
"zod": "^3.23.8"
|
||||
}
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
import { z } from "zod";
|
||||
|
||||
export const zDatasetInfo = z.object({
|
||||
metadata_modified: z.coerce.date(),
|
||||
metadata_created: z.coerce.date(),
|
||||
resources: z.array(
|
||||
z.object({
|
||||
id: z.string(),
|
||||
size: z.number(),
|
||||
format: z.string(),
|
||||
created: z.coerce.date(),
|
||||
url: z.string(),
|
||||
modified: z.coerce.date().optional(),
|
||||
description: z.string(),
|
||||
}),
|
||||
),
|
||||
});
|
|
@ -1,27 +0,0 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
// Enable latest features
|
||||
"lib": ["ESNext", "DOM"],
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleDetection": "force",
|
||||
"jsx": "react-jsx",
|
||||
"allowJs": true,
|
||||
|
||||
// Bundler mode
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"verbatimModuleSyntax": true,
|
||||
"noEmit": true,
|
||||
|
||||
// Best practices
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
|
||||
// Some stricter flags (disabled by default)
|
||||
"noUnusedLocals": false,
|
||||
"noUnusedParameters": false,
|
||||
"noPropertyAccessFromIndexSignature": false
|
||||
}
|
||||
}
|
175
sepa-precios-importer/.gitignore
vendored
175
sepa-precios-importer/.gitignore
vendored
|
@ -1,175 +0,0 @@
|
|||
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
|
||||
|
||||
# Logs
|
||||
|
||||
logs
|
||||
_.log
|
||||
npm-debug.log_
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
lerna-debug.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# Caches
|
||||
|
||||
.cache
|
||||
|
||||
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||
|
||||
report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
|
||||
|
||||
# Runtime data
|
||||
|
||||
pids
|
||||
_.pid
|
||||
_.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
|
||||
coverage
|
||||
*.lcov
|
||||
|
||||
# nyc test coverage
|
||||
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Snowpack dependency directory (https://snowpack.dev/)
|
||||
|
||||
web_modules/
|
||||
|
||||
# TypeScript cache
|
||||
|
||||
*.tsbuildinfo
|
||||
|
||||
# Optional npm cache directory
|
||||
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
|
||||
.eslintcache
|
||||
|
||||
# Optional stylelint cache
|
||||
|
||||
.stylelintcache
|
||||
|
||||
# Microbundle cache
|
||||
|
||||
.rpt2_cache/
|
||||
.rts2_cache_cjs/
|
||||
.rts2_cache_es/
|
||||
.rts2_cache_umd/
|
||||
|
||||
# Optional REPL history
|
||||
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variable files
|
||||
|
||||
.env
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.local
|
||||
|
||||
# parcel-bundler cache (https://parceljs.org/)
|
||||
|
||||
.parcel-cache
|
||||
|
||||
# Next.js build output
|
||||
|
||||
.next
|
||||
out
|
||||
|
||||
# Nuxt.js build / generate output
|
||||
|
||||
.nuxt
|
||||
dist
|
||||
|
||||
# Gatsby files
|
||||
|
||||
# Comment in the public line in if your project uses Gatsby and not Next.js
|
||||
|
||||
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||
|
||||
# public
|
||||
|
||||
# vuepress build output
|
||||
|
||||
.vuepress/dist
|
||||
|
||||
# vuepress v2.x temp and cache directory
|
||||
|
||||
.temp
|
||||
|
||||
# Docusaurus cache and generated files
|
||||
|
||||
.docusaurus
|
||||
|
||||
# Serverless directories
|
||||
|
||||
.serverless/
|
||||
|
||||
# FuseBox cache
|
||||
|
||||
.fusebox/
|
||||
|
||||
# DynamoDB Local files
|
||||
|
||||
.dynamodb/
|
||||
|
||||
# TernJS port file
|
||||
|
||||
.tern-port
|
||||
|
||||
# Stores VSCode versions used for testing VSCode extensions
|
||||
|
||||
.vscode-test
|
||||
|
||||
# yarn v2
|
||||
|
||||
.yarn/cache
|
||||
.yarn/unplugged
|
||||
.yarn/build-state.yml
|
||||
.yarn/install-state.gz
|
||||
.pnp.*
|
||||
|
||||
# IntelliJ based IDEs
|
||||
.idea
|
||||
|
||||
# Finder (MacOS) folder config
|
||||
.DS_Store
|
|
@ -1,14 +0,0 @@
|
|||
# sepa-precios-importer
|
||||
|
||||
Importador de [datasets de precios de SEPA](https://datos.produccion.gob.ar/dataset/sepa-precios/archivo/d076720f-a7f0-4af8-b1d6-1b99d5a90c14) a una base de datos PostgreSQL.
|
||||
|
||||
Véase [Errores en el formato de los datos SEPA](https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b).
|
||||
|
||||
To install dependencies:
|
||||
|
||||
```bash
|
||||
bun install
|
||||
bun run index.ts ~/carpeta-con-datasets-descomprimidos
|
||||
```
|
||||
|
||||
This project was created using `bun init` in bun v1.1.26. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
|
Binary file not shown.
|
@ -1,231 +0,0 @@
|
|||
import { readFile } from "fs/promises";
|
||||
import Papa from "papaparse";
|
||||
import { basename, join, dirname } from "path";
|
||||
import postgres from "postgres";
|
||||
import { Readable } from "stream";
|
||||
import { pipeline } from "node:stream/promises";
|
||||
import { Glob } from "bun";
|
||||
|
||||
const sql = postgres({
|
||||
database: "sepa-precios",
|
||||
});
|
||||
|
||||
// await sql`
|
||||
// drop table if exists precios;`;
|
||||
// await sql`
|
||||
// drop table if exists datasets;`;
|
||||
await sql`
|
||||
CREATE TABLE if not exists datasets (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT UNIQUE,
|
||||
date DATE
|
||||
);`;
|
||||
await sql`
|
||||
CREATE TABLE if not exists sucursales (
|
||||
id_dataset INTEGER REFERENCES datasets(id),
|
||||
id_comercio INTEGER,
|
||||
id_bandera INTEGER,
|
||||
id_sucursal INTEGER,
|
||||
sucursales_nombre TEXT,
|
||||
sucursales_tipo TEXT,
|
||||
sucursales_calle TEXT,
|
||||
sucursales_numero TEXT,
|
||||
sucursales_latitud NUMERIC,
|
||||
sucursales_longitud NUMERIC,
|
||||
sucursales_observaciones TEXT,
|
||||
sucursales_barrio TEXT,
|
||||
sucursales_codigo_postal TEXT,
|
||||
sucursales_localidad TEXT,
|
||||
sucursales_provincia TEXT,
|
||||
sucursales_lunes_horario_atencion TEXT,
|
||||
sucursales_martes_horario_atencion TEXT,
|
||||
sucursales_miercoles_horario_atencion TEXT,
|
||||
sucursales_jueves_horario_atencion TEXT,
|
||||
sucursales_viernes_horario_atencion TEXT,
|
||||
sucursales_sabado_horario_atencion TEXT,
|
||||
sucursales_domingo_horario_atencion TEXT,
|
||||
UNIQUE (id_dataset, id_comercio, id_bandera, id_sucursal)
|
||||
);`;
|
||||
await sql`
|
||||
CREATE TABLE if not exists precios (
|
||||
id_dataset INTEGER REFERENCES datasets(id),
|
||||
id_comercio INTEGER,
|
||||
id_bandera INTEGER,
|
||||
id_sucursal INTEGER,
|
||||
id_producto BIGINT,
|
||||
productos_ean INTEGER,
|
||||
productos_descripcion TEXT,
|
||||
productos_cantidad_presentacion NUMERIC(10, 2),
|
||||
productos_unidad_medida_presentacion TEXT,
|
||||
productos_marca TEXT,
|
||||
productos_precio_lista NUMERIC(10, 2),
|
||||
productos_precio_referencia NUMERIC(10, 2),
|
||||
productos_cantidad_referencia NUMERIC(10, 2),
|
||||
productos_unidad_medida_referencia TEXT,
|
||||
productos_precio_unitario_promo1 NUMERIC(10, 2),
|
||||
productos_leyenda_promo1 TEXT,
|
||||
productos_precio_unitario_promo2 NUMERIC(10, 2),
|
||||
productos_leyenda_promo2 TEXT,
|
||||
FOREIGN KEY (id_dataset, id_comercio, id_bandera, id_sucursal) REFERENCES sucursales(id_dataset, id_comercio, id_bandera, id_sucursal)
|
||||
);
|
||||
`;
|
||||
|
||||
/**
 * Bulk-loads `sucursales.csv` from `dir` into the `sucursales` table.
 *
 * Rows missing any of `id_comercio`, `id_bandera` or `id_sucursal` are
 * dropped. Every remaining row is tagged with `datasetId` and streamed to
 * PostgreSQL through `COPY ... FROM STDIN` as tab-separated lines. The column
 * list is taken from the keys of the first row.
 *
 * @param sql - postgres.js connection (or transaction) to run the COPY on
 * @param datasetId - id of the owning row in `datasets`
 * @param dir - directory containing `sucursales.csv`
 */
async function importSucursales(
  sql: postgres.Sql,
  datasetId: number,
  dir: string,
) {
  const csv = await readFile(join(dir, "sucursales.csv"), "utf-8");
  const sucursales: Papa.ParseResult<any> = Papa.parse(csv, {
    header: true,
  });

  const objs = sucursales.data
    .filter((data) => data.id_comercio && data.id_bandera && data.id_sucursal)
    .map((data) => {
      // Megatone ships this column without the underscore after "domingo";
      // rename it so it matches the table column.
      if ("sucursales_domingohorario_atencion" in data) {
        data.sucursales_domingo_horario_atencion =
          data.sucursales_domingohorario_atencion;
        delete data.sucursales_domingohorario_atencion;
      }
      return {
        id_dataset: datasetId,
        ...data,
      };
    });

  // Without this guard `Object.keys(objs[0])` below throws a TypeError when the
  // CSV has no usable rows.
  if (objs.length === 0) {
    console.error(`No hay sucursales para el dataset ${dir}`);
    return;
  }

  const keys = Object.keys(objs[0]);
  const lines = Readable.from(
    objs.map((data) => keys.map((key) => (data as any)[key]).join("\t") + "\n"),
  );
  // QUOTE is set to backspace, a character that is not expected in the data.
  // NOTE(review): column names come straight from the CSV header and are
  // injected with `sql.unsafe`; confirm the input is trusted.
  const writable =
    await sql`copy sucursales (${sql.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
  await pipeline(lines, writable);
}
|
||||
|
||||
/**
 * Imports one unpacked SEPA dataset directory into PostgreSQL.
 *
 * Inside a single transaction it registers the dataset in `datasets`, imports
 * `sucursales.csv`, and bulk-copies the rows of `productos.csv` into `precios`.
 * A dataset whose name is already registered (unique violation, SQLSTATE
 * 23505) is reported and skipped.
 *
 * @param dir - dataset directory; its basename must contain a `YYYY-MM-DD` date
 * @throws Error when the directory name has no date or `comercio.csv` has no rows
 * @throws any database or filesystem error other than a unique violation
 */
async function importDataset(dir: string) {
  const datasetName = basename(dir);
  const dateMatch = datasetName.match(/(\d{4}-\d{2}-\d{2})/);
  if (!dateMatch) {
    throw new Error(
      `No se pudo extraer la fecha (YYYY-MM-DD) del nombre del dataset ${dir}`,
    );
  }
  const date = dateMatch[1];
  // TODO: parse "Ultima actualizacion" at the end of the CSV and store it in the datasets table

  try {
    await sql.begin(async (tx) => {
      const res =
        await tx`insert into datasets (name, date) values (${datasetName}, ${date}) returning id`;
      const datasetId: number = res[0].id;

      const comercios: Papa.ParseResult<{ comercio_cuit: string }> = Papa.parse(
        await readFile(join(dir, "comercio.csv"), "utf-8"),
        { header: true },
      );
      const comercio = comercios.data[0];
      if (!comercio) {
        throw new Error(`comercio.csv de ${dir} no tiene filas`);
      }
      const comercioCuit = comercio.comercio_cuit;
      console.log(`dataset ${datasetId}, comercio ${comercioCuit}`);

      await importSucursales(tx, datasetId, dir);

      let file = await readFile(join(dir, "productos.csv"), "utf-8");
      // WALL OF SHAME: these providers do not produce well-formed CSVs
      if (comercioCuit === "30612929455") {
        // Libertad S.A.
        file = file.replaceAll("|RAPTOR 6X16X45", "/RAPTOR 6X16X45");
      } else if (comercioCuit === "30578411174") {
        // Alberdi S.A.
        file = file.replaceAll(";", "|");
      }
      if (
        ["33504047089", "30707429468", "30589621499"].includes(comercioCuit)
      ) {
        // TODO: they do have the values, but under other names; for example
        // productos_precio_lista would be precio_unitario_bulto_por_unidad_venta_con_iva.
        // Not importing them for now rather than guessing at the mapping.
        console.error(
          `No voy a importar el dataset ${dir} porque el formato está mal. Pero se podría importar. Pero por ahora no lo voy a hacer. Véase https://gist.github.com/catdevnull/587d5c63c4bab11b9798861c917db93b`,
        );
        // Returning here still commits the dataset and its sucursales.
        return;
      }

      console.time("parse");
      const datas: any[] = [];
      // Collect the rows that carry all four identifying columns; the promise
      // settles once Papa Parse signals completion.
      await new Promise<void>((resolve) => {
        Papa.parse(file, {
          header: true,
          skipEmptyLines: true,
          step: (result: any) => {
            const { data } = result;
            if (
              data.id_comercio &&
              data.id_bandera &&
              data.id_sucursal &&
              data.id_producto
            )
              datas.push(data);
          },
          complete: () => resolve(),
        });
      });
      console.timeEnd("parse");

      try {
        console.time("map");
        const objs = datas.map((data) => {
          // `id_dun_14` has no column in `precios`.
          delete data.id_dun_14;
          return {
            id_dataset: datasetId,
            ...data,
            // Tabs would break the tab-delimited COPY stream below.
            productos_descripcion: data.productos_descripcion.replaceAll(
              "\t",
              " ",
            ),
          };
        });
        if (!objs.length) {
          console.error(`No hay datos para el dataset ${dir}`);
          return;
        }
        const keys = Object.keys(objs[0]);
        const lines = Readable.from(
          objs.map((data) => keys.map((key) => data[key]).join("\t") + "\n"),
        );
        console.timeEnd("map");

        console.time("copy");
        const writable =
          await tx`copy precios (${tx.unsafe(keys.join(", "))}) from stdin with CSV DELIMITER E'\t' QUOTE E'\b'`.writable();
        await pipeline(lines, writable);
        console.timeEnd("copy");
        console.info(`saved ${objs.length} rows`);
      } finally {
        // Force a collection after each dataset, whether or not the copy succeeded.
        Bun.gc(true);
      }
    });
  } catch (e) {
    // 23505 = unique_violation.
    // NOTE(review): this also matches unique violations raised while copying
    // sucursales, not only a duplicate dataset name — confirm that is intended.
    if ((e as any).code === "23505") {
      console.log(`dataset ${datasetName} already exists`);
      return;
    }
    throw e;
  }
}
|
||||
|
||||
try {
|
||||
const glob = new Glob("**/productos.csv");
|
||||
for await (const file of glob.scan(process.argv[2])) {
|
||||
const dir = join(process.argv[2], dirname(file));
|
||||
console.log(dir);
|
||||
await importDataset(dir);
|
||||
}
|
||||
} finally {
|
||||
await sql.end();
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
{
|
||||
"name": "sepa-precios-importer",
|
||||
"module": "index.ts",
|
||||
"type": "module",
|
||||
"devDependencies": {
|
||||
"@types/bun": "^1.1.7",
|
||||
"@types/papaparse": "^5.3.14"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"p-queue": "^8.0.1",
|
||||
"papaparse": "^5.4.1",
|
||||
"postgres": "^3.4.4"
|
||||
}
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
// Enable latest features
|
||||
"lib": ["ESNext", "DOM"],
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleDetection": "force",
|
||||
"jsx": "react-jsx",
|
||||
"allowJs": true,
|
||||
|
||||
// Bundler mode
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"verbatimModuleSyntax": true,
|
||||
"noEmit": true,
|
||||
|
||||
// Best practices
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
|
||||
// Some stricter flags (disabled by default)
|
||||
"noUnusedLocals": false,
|
||||
"noUnusedParameters": false,
|
||||
"noPropertyAccessFromIndexSignature": false
|
||||
}
|
||||
}
|
|
@ -1,2 +0,0 @@
|
|||
DB_PATH=../db.db
|
||||
VITE_API_HOST=http://localhost:8000
|
2
sitio/.gitignore
vendored
2
sitio/.gitignore
vendored
|
@ -4,7 +4,7 @@ node_modules
|
|||
/.svelte-kit
|
||||
/package
|
||||
.env
|
||||
*.local
|
||||
.env.*
|
||||
!.env.example
|
||||
vite.config.js.timestamp-*
|
||||
vite.config.ts.timestamp-*
|
||||
|
|
|
@ -40,7 +40,6 @@
|
|||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||
"dayjs": "^1.11.10",
|
||||
"drizzle-orm": "^0.32.0",
|
||||
"ky": "^1.5.0",
|
||||
"zod": "^3.22.4"
|
||||
},
|
||||
"packageManager": "pnpm@9.5.0+sha512.140036830124618d624a2187b50d04289d5a087f326c9edfc0ccd733d76c4f52c3a313d4fc148794a2a9d81553016004e6742e8cf850670268a7387fc220c903"
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
<script lang="ts" context="module">
|
||||
export type Product = {
|
||||
ean: string;
|
||||
name: string | null;
|
||||
image_url: string | null;
|
||||
};
|
||||
export type Product = { ean: string; name: string; imageUrl: string | null };
|
||||
</script>
|
||||
|
||||
<script lang="ts">
|
||||
|
@ -11,9 +7,9 @@
|
|||
</script>
|
||||
|
||||
<a href={`/ean/${product.ean}`} class="flex gap-2">
|
||||
{#if product.image_url}
|
||||
{#if product.imageUrl}
|
||||
<img
|
||||
src={product.image_url}
|
||||
src={product.imageUrl}
|
||||
alt={product.name}
|
||||
class="max-h-48"
|
||||
loading="lazy"
|
||||
|
|
|
@ -1,2 +1 @@
|
|||
// place files you want to import through the `$lib` alias in this folder.
|
||||
export const API_HOST = import.meta.env.VITE_API_HOST;
|
||||
|
|
2
sitio/src/lib/server/db.ts
Normal file
2
sitio/src/lib/server/db.ts
Normal file
|
@ -0,0 +1,2 @@
|
|||
export { getDb } from "db-datos/db.js";
|
||||
export * as schema from "db-datos/schema.js";
|
|
@ -1,17 +1,15 @@
|
|||
import { countDistinct } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { z } from "zod";
|
||||
import ky from "ky";
|
||||
import { API_HOST } from "$lib";
|
||||
|
||||
async function getInfo() {
|
||||
return z
|
||||
.object({
|
||||
count: z.number(),
|
||||
})
|
||||
.parse(await ky.get(`${API_HOST}/api/0/info`).json());
|
||||
}
|
||||
import { getDb, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
export const load: PageServerLoad = async () => {
|
||||
const nProductos = (await getInfo()).count;
|
||||
const db = await getDb();
|
||||
const nProductosR = await db
|
||||
.select({
|
||||
count: countDistinct(precios.ean),
|
||||
})
|
||||
.from(precios);
|
||||
const nProductos = nProductosR[0].count;
|
||||
return { nProductos };
|
||||
};
|
||||
|
|
|
@ -1,29 +1,68 @@
|
|||
import type { PageServerLoad } from "./$types";
|
||||
import { getDb, schema } from "$lib/server/db";
|
||||
const { precios, bestSelling } = schema;
|
||||
import { max, sql } from "drizzle-orm";
|
||||
import z from "zod";
|
||||
import type { Product } from "$lib/ProductPreview.svelte";
|
||||
|
||||
async function getBestSelling() {
|
||||
const res = await fetch(
|
||||
`${import.meta.env.VITE_API_HOST}/api/0/best-selling-products`,
|
||||
type Data = {
|
||||
category: string;
|
||||
products: Product[];
|
||||
}[];
|
||||
|
||||
let cache: Promise<{ key: Date; data: Data }> = doQuery();
|
||||
|
||||
async function doQuery() {
|
||||
const db = await getDb();
|
||||
|
||||
const categories = await db
|
||||
.select({
|
||||
fetchedAt: bestSelling.fetchedAt,
|
||||
category: bestSelling.category,
|
||||
eansJson: bestSelling.eansJson,
|
||||
})
|
||||
.from(bestSelling)
|
||||
.groupBy(bestSelling.category)
|
||||
.having(max(bestSelling.fetchedAt));
|
||||
|
||||
const categoriesWithProducts = await Promise.all(
|
||||
categories.map(async (category) => {
|
||||
const eans = z.array(z.string()).parse(JSON.parse(category.eansJson));
|
||||
|
||||
const products = await db
|
||||
.select({
|
||||
ean: precios.ean,
|
||||
name: precios.name,
|
||||
imageUrl: precios.imageUrl,
|
||||
})
|
||||
.from(precios)
|
||||
.where(sql`${precios.ean} in ${eans}`)
|
||||
.groupBy(precios.ean)
|
||||
.having(max(precios.fetchedAt));
|
||||
|
||||
return {
|
||||
category: category.category,
|
||||
products: eans
|
||||
.map((ean) => products.find((p) => p.ean === ean))
|
||||
.filter((x): x is Product => !!x && !!x.name),
|
||||
};
|
||||
}),
|
||||
);
|
||||
const json = await res.json();
|
||||
return z
|
||||
.array(
|
||||
z.object({
|
||||
category: z.string(),
|
||||
products: z.array(
|
||||
z.object({
|
||||
ean: z.string(),
|
||||
name: z.string().nullable(),
|
||||
image_url: z.string().nullable(),
|
||||
}),
|
||||
),
|
||||
}),
|
||||
)
|
||||
.parse(json);
|
||||
|
||||
return { key: new Date(), data: categoriesWithProducts };
|
||||
}
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
return {
|
||||
data: await getBestSelling(),
|
||||
};
|
||||
console.log("setting up interval");
|
||||
setInterval(
|
||||
async () => {
|
||||
const c = await doQuery();
|
||||
cache = Promise.resolve(c);
|
||||
},
|
||||
4 * 60 * 60 * 1000,
|
||||
);
|
||||
|
||||
export const load: PageServerLoad = async ({
|
||||
params,
|
||||
}): Promise<{ data: Data }> => {
|
||||
return { data: (await cache).data };
|
||||
};
|
||||
|
|
|
@ -1,23 +1,20 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import { eq } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { z } from "zod";
|
||||
import { zPrecio, type Precio } from "./common";
|
||||
import { API_HOST } from "$lib";
|
||||
|
||||
async function getProductHistory(ean: string) {
|
||||
const res = await fetch(`${API_HOST}/api/0/ean/${ean}/history`);
|
||||
const json = await res.json();
|
||||
return z.array(zPrecio).parse(json);
|
||||
}
|
||||
import { getDb, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const res = await getProductHistory(params.ean);
|
||||
const db = await getDb();
|
||||
const q = db
|
||||
.select()
|
||||
.from(precios)
|
||||
.where(eq(precios.ean, params.ean))
|
||||
.orderBy(precios.fetchedAt);
|
||||
const res = await q;
|
||||
if (res.length === 0) return error(404, "Not Found");
|
||||
|
||||
const meta = res.findLast(
|
||||
(p): p is Precio & { name: string; image_url: string } =>
|
||||
!!(p.name && p.image_url),
|
||||
);
|
||||
const meta = res.findLast((p) => p.name);
|
||||
|
||||
return { precios: res, meta };
|
||||
};
|
||||
|
|
|
@ -1,18 +1,18 @@
|
|||
<script lang="ts">
|
||||
import { Supermercado, hosts } from "db-datos/supermercado";
|
||||
import * as schema from "db-datos/schema";
|
||||
import type { PageData } from "./$types";
|
||||
import Chart from "./Chart.svelte";
|
||||
import type { Precio } from "./common";
|
||||
|
||||
export let data: PageData;
|
||||
|
||||
let urls: Map<Supermercado, Precio>;
|
||||
let urls: Map<Supermercado, schema.Precio>;
|
||||
$: urls = data.precios.reduce((prev, curr) => {
|
||||
const url = new URL(curr.url);
|
||||
const supermercado = hosts[url.hostname];
|
||||
prev.set(supermercado, curr);
|
||||
return prev;
|
||||
}, new Map<Supermercado, Precio>());
|
||||
}, new Map<Supermercado, schema.Precio>());
|
||||
|
||||
const classBySupermercado: { [supermercado in Supermercado]: string } = {
|
||||
[Supermercado.Dia]: "bg-[#d52b1e] focus:ring-[#d52b1e]",
|
||||
|
@ -30,18 +30,18 @@
|
|||
|
||||
{#if data.meta}
|
||||
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
|
||||
<img src={data.meta.image_url} alt={data.meta.name} class="max-h-48" />
|
||||
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
|
||||
<div class="flex gap-2">
|
||||
{#each urls as [supermercado, { url, precio_centavos }]}
|
||||
{#each urls as [supermercado, { url, precioCentavos }]}
|
||||
<a
|
||||
href={url}
|
||||
rel="noreferrer noopener"
|
||||
target="_blank"
|
||||
class={`focus:shadow-outline inline-flex flex-col items-center justify-center rounded-md ${classBySupermercado[supermercado]} px-4 py-2 font-medium tracking-wide text-white transition-colors duration-200 hover:bg-opacity-80 focus:outline-none focus:ring-2 focus:ring-offset-2`}
|
||||
>
|
||||
{#if precio_centavos}
|
||||
{#if precioCentavos}
|
||||
<span class="text-lg font-bold"
|
||||
>{formatter.format(precio_centavos / 100)}</span
|
||||
>{formatter.format(precioCentavos / 100)}</span
|
||||
>
|
||||
{/if}
|
||||
<span class="text-sm">{supermercado}</span>
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
<script lang="ts">
|
||||
import type { Precio } from "db-datos/schema";
|
||||
// import dayjs from "dayjs";
|
||||
import ChartJs from "./ChartJs.svelte";
|
||||
import { hosts, colorBySupermercado } from "db-datos/supermercado";
|
||||
import type { Precio } from "./common";
|
||||
|
||||
export let precios: Precio[];
|
||||
|
||||
|
@ -15,15 +15,15 @@
|
|||
const ps = precios
|
||||
.filter((p) => new URL(p.url!).hostname === host)
|
||||
.filter(
|
||||
(p): p is Precio & { precio_centavos: number } =>
|
||||
p.precio_centavos !== null,
|
||||
(p): p is Precio & { precioCentavos: number } =>
|
||||
p.precioCentavos !== null,
|
||||
);
|
||||
return {
|
||||
label: supermercado,
|
||||
data: [
|
||||
...ps.map((p) => ({
|
||||
x: p.fetched_at,
|
||||
y: p.precio_centavos / 100,
|
||||
x: p.fetchedAt,
|
||||
y: p.precioCentavos / 100,
|
||||
})),
|
||||
// lie
|
||||
// ...ps.map((p) => ({
|
||||
|
|
|
@ -1,12 +0,0 @@
|
|||
import { z } from "zod";
|
||||
|
||||
export const zPrecio = z.object({
|
||||
ean: z.string(),
|
||||
fetched_at: z.coerce.date(),
|
||||
precio_centavos: z.number().nullable(),
|
||||
in_stock: z.boolean().nullable(),
|
||||
url: z.string(),
|
||||
name: z.string().nullable(),
|
||||
image_url: z.string().nullable(),
|
||||
});
|
||||
export type Precio = z.infer<typeof zPrecio>;
|
|
@ -1,29 +1,26 @@
|
|||
import { z } from "zod";
|
||||
import { sql } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { API_HOST } from "$lib";
|
||||
import ky from "ky";
|
||||
|
||||
const zProductResult = z.object({
|
||||
ean: z.string(),
|
||||
name: z.string(),
|
||||
image_url: z.string(),
|
||||
});
|
||||
|
||||
async function search(query: string) {
|
||||
return z
|
||||
.array(zProductResult)
|
||||
.parse(
|
||||
await ky
|
||||
.get(`${API_HOST}/api/0/search/${encodeURIComponent(query)}`)
|
||||
.json(),
|
||||
);
|
||||
}
|
||||
import { getDb } from "$lib/server/db";
|
||||
|
||||
export const load: PageServerLoad = async ({ url }) => {
|
||||
const db = await getDb();
|
||||
const query = url.searchParams.get("q");
|
||||
let results: null | { ean: string; name: string; image_url: string }[] = query
|
||||
? await search(query)
|
||||
: null;
|
||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||
if (query) {
|
||||
const sQuery = query
|
||||
.replaceAll(`"`, `""`)
|
||||
.split(" ")
|
||||
.map((s) => `"${s}"`)
|
||||
.join(" ");
|
||||
console.debug(sQuery);
|
||||
const sqlQuery = sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||
join precios p on p.ean = f.ean
|
||||
where f.name match ${sQuery}
|
||||
group by p.ean
|
||||
having max(p.fetched_at)
|
||||
order by p.in_stock desc;`;
|
||||
results = db.all(sqlQuery);
|
||||
}
|
||||
|
||||
return { query, results };
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue