Compare commits

...

4 commits

Author SHA1 Message Date
adf1d7ac59 usar 50 coroutinas en coto 2024-08-19 20:47:33 -03:00
5943d80252 usar db.db 2024-08-13 20:13:02 -03:00
905d94a55e trim harder
closes #44
2024-08-04 17:31:00 -03:00
8617a0b2a5 index ean, fetched_at 2024-08-04 15:35:43 -03:00
28 changed files with 309 additions and 89 deletions

View file

@ -2,6 +2,8 @@ data/warcs/
data/carrefour/ data/carrefour/
*/*.db* */*.db*
sqlite.db sqlite.db
db.db
db.db-wal
downloader/ downloader/
node_modules/ node_modules/
*/node_modules/ */node_modules/

View file

@ -1 +1 @@
DB_PATH=../sqlite.db DB_PATH=../db.db

View file

@ -1,4 +1,4 @@
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db"; export const DB_PATH = process.env.DB_PATH ?? "../db.db";
/** @type { import("drizzle-kit").Config } */ /** @type { import("drizzle-kit").Config } */
export default { export default {

View file

@ -0,0 +1 @@
CREATE INDEX `precios_ean_fetched_at_idx` ON `precios` (`ean`,`fetched_at`);

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "88aa0254-106e-424e-ab66-417ff44bbf0b",
"prevId": "00000000-0000-0000-0000-000000000000",
"tables": { "tables": {
"precios": { "precios": {
"name": "precios", "name": "precios",
@ -58,8 +56,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "88aa0254-106e-424e-ab66-417ff44bbf0b",
"prevId": "00000000-0000-0000-0000-000000000000"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"prevId": "88aa0254-106e-424e-ab66-417ff44bbf0b",
"tables": { "tables": {
"precios": { "precios": {
"name": "precios", "name": "precios",
@ -72,8 +70,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"prevId": "88aa0254-106e-424e-ab66-417ff44bbf0b"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"tables": { "tables": {
"precios": { "precios": {
"name": "precios", "name": "precios",
@ -86,8 +84,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"prevId": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"tables": { "tables": {
"precios": { "precios": {
"name": "precios", "name": "precios",
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"prevId": "cbd90a60-7568-489f-ac45-95bd8818ffbd"
} }

View file

@ -1,7 +1,5 @@
{ {
"id": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958", "version": "6",
"prevId": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"version": "5",
"dialect": "sqlite", "dialect": "sqlite",
"tables": { "tables": {
"precios": { "precios": {
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"columns": {}, "tables": {},
"schemas": {}, "columns": {}
"tables": {} },
} "id": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"prevId": "e1217fdb-6f54-44c5-a04b-c5aebf202102"
} }

View file

@ -1,7 +1,5 @@
{ {
"id": "f2cf47b9-e137-41c9-b7fb-6bc016588db0", "version": "6",
"prevId": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"version": "5",
"dialect": "sqlite", "dialect": "sqlite",
"tables": { "tables": {
"precios": { "precios": {
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"columns": {}, "tables": {},
"schemas": {}, "columns": {}
"tables": {} },
} "id": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"prevId": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958"
} }

View file

@ -1,7 +1,5 @@
{ {
"id": "ac099405-ecd0-4637-ae5e-fb29c9847e45", "version": "6",
"prevId": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"version": "5",
"dialect": "sqlite", "dialect": "sqlite",
"tables": { "tables": {
"precios": { "precios": {
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"columns": {}, "tables": {},
"schemas": {}, "columns": {}
"tables": {} },
} "id": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"prevId": "f2cf47b9-e137-41c9-b7fb-6bc016588db0"
} }

View file

@ -1,7 +1,5 @@
{ {
"id": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25", "version": "6",
"prevId": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"version": "5",
"dialect": "sqlite", "dialect": "sqlite",
"tables": { "tables": {
"precios": { "precios": {
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"columns": {}, "tables": {},
"schemas": {}, "columns": {}
"tables": {} },
} "id": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"prevId": "ac099405-ecd0-4637-ae5e-fb29c9847e45"
} }

View file

@ -1,7 +1,5 @@
{ {
"id": "082630a9-3744-4e33-bde5-06045ca57d36", "version": "6",
"prevId": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"version": "5",
"dialect": "sqlite", "dialect": "sqlite",
"tables": { "tables": {
"precios": { "precios": {
@ -94,8 +92,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"columns": {}, "tables": {},
"schemas": {}, "columns": {}
"tables": {} },
} "id": "082630a9-3744-4e33-bde5-06045ca57d36",
"prevId": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
"tables": { "tables": {
"precios": { "precios": {
"name": "precios", "name": "precios",
@ -139,8 +137,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "c8297337-4ed8-432e-8782-65d41be42e00",
"prevId": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"tables": { "tables": {
"db_best_selling": { "db_best_selling": {
"name": "db_best_selling", "name": "db_best_selling",
@ -176,8 +174,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "c8297337-4ed8-432e-8782-65d41be42e00",
"prevId": "2e398920-ffaf-4d55-ae13-d906cb9e0efa"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84",
"prevId": "c8297337-4ed8-432e-8782-65d41be42e00",
"tables": { "tables": {
"db_best_selling": { "db_best_selling": {
"name": "db_best_selling", "name": "db_best_selling",
@ -183,8 +181,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84",
"prevId": "c8297337-4ed8-432e-8782-65d41be42e00"
} }

View file

@ -1,8 +1,6 @@
{ {
"version": "5", "version": "6",
"dialect": "sqlite", "dialect": "sqlite",
"id": "16046188-ab24-4bd4-bfb4-8a81f24c6f28",
"prevId": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84",
"tables": { "tables": {
"db_best_selling": { "db_best_selling": {
"name": "db_best_selling", "name": "db_best_selling",
@ -190,8 +188,9 @@
}, },
"enums": {}, "enums": {},
"_meta": { "_meta": {
"schemas": {},
"tables": {}, "tables": {},
"columns": {} "columns": {}
} },
"id": "16046188-ab24-4bd4-bfb4-8a81f24c6f28",
"prevId": "8b4921b5-6ecd-4d69-ba64-9b0bfb53db84"
} }

View file

@ -0,0 +1,208 @@
{
"version": "6",
"dialect": "sqlite",
"id": "c95c6547-d540-45cf-aa9d-9d828efb468e",
"prevId": "16046188-ab24-4bd4-bfb4-8a81f24c6f28",
"tables": {
"db_best_selling": {
"name": "db_best_selling",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"category": {
"name": "category",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"eans_json": {
"name": "eans_json",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
},
"precios_url_idx": {
"name": "precios_url_idx",
"columns": [
"url"
],
"isUnique": false
},
"precios_fetched_at_idx": {
"name": "precios_fetched_at_idx",
"columns": [
"fetched_at"
],
"isUnique": false
},
"precios_ean_fetched_at_idx": {
"name": "precios_ean_fetched_at_idx",
"columns": [
"ean",
"fetched_at"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
},
"internal": {
"indexes": {}
}
}

View file

@ -92,6 +92,13 @@
"when": 1719680946811, "when": 1719680946811,
"tag": "0012_hard_red_wolf", "tag": "0012_hard_red_wolf",
"breakpoints": true "breakpoints": true
},
{
"idx": 13,
"version": "6",
"when": 1722796469056,
"tag": "0013_harsh_starbolt",
"breakpoints": true
} }
] ]
} }

View file

@ -5,7 +5,7 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"generate": "drizzle-kit generate:sqlite", "generate": "drizzle-kit generate",
"migrate": "node migrate-cli.js" "migrate": "node migrate-cli.js"
}, },
"keywords": [], "keywords": [],

View file

@ -22,6 +22,10 @@ export const precios = sqliteTable(
preciosFetchedAtIdx: index("precios_fetched_at_idx").on( preciosFetchedAtIdx: index("precios_fetched_at_idx").on(
precios.fetchedAt precios.fetchedAt
), ),
preciosEanFetchedAtIdx: index("precios_ean_fetched_at_idx").on(
precios.ean,
precios.fetchedAt
),
}; };
} }
); );

View file

@ -1 +1 @@
DATABASE_URL=sqlite://../sqlite.db DATABASE_URL=sqlite://../db.db

View file

@ -282,7 +282,7 @@ async fn main() {
.connect_with( .connect_with(
SqliteConnectOptions::from_str(&format!( SqliteConnectOptions::from_str(&format!(
"sqlite://{}", "sqlite://{}",
env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()) env::var("DB_PATH").unwrap_or("../db.db".to_string())
)) ))
.unwrap() .unwrap()
.journal_mode(sqlx::sqlite::SqliteJournalMode::Wal) .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)

View file

@ -1,3 +1,5 @@
use std::env;
use super::now_sec; use super::now_sec;
use super::AutoArgs; use super::AutoArgs;
use super::AutoTelegram; use super::AutoTelegram;
@ -61,7 +63,16 @@ impl Auto {
// } // }
{ {
let t0 = now_sec(); let t0 = now_sec();
let counters = self.scraper.fetch_list(&self.db, links).await;
let n_coroutines = if supermercado == Supermercado::Coto {
50
} else {
env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número")
};
let counters = self.scraper.fetch_list(&self.db, links, n_coroutines).await;
self.inform(&format!( self.inform(&format!(
"Downloaded {:?}: {:?} (took {})", "Downloaded {:?}: {:?} (took {})",
&supermercado, &supermercado,

View file

@ -17,7 +17,7 @@ pub struct Db {
impl Db { impl Db {
pub async fn connect() -> anyhow::Result<Self> { pub async fn connect() -> anyhow::Result<Self> {
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()); let db_path = env::var("DB_PATH").unwrap_or("../db.db".to_string());
info!("Opening DB at {}", db_path); info!("Opening DB at {}", db_path);
let read_pool = connect_to_db(&db_path, 32).await?; let read_pool = connect_to_db(&db_path, 32).await?;
let write_pool = connect_to_db(&db_path, 1).await?; let write_pool = connect_to_db(&db_path, 1).await?;

View file

@ -128,11 +128,7 @@ impl Scraper {
counters counters
} }
pub async fn fetch_list(&self, db: &Db, links: Vec<String>) -> Counters { pub async fn fetch_list(&self, db: &Db, links: Vec<String>, n_coroutines: usize) -> Counters {
let n_coroutines = env::var("N_COROUTINES")
.map_or(Ok(24), |s| s.parse::<usize>())
.expect("N_COROUTINES no es un número");
stream::iter(links) stream::iter(links)
.map(|url| { .map(|url| {
let db = db.clone(); let db = db.clone();

View file

@ -56,7 +56,11 @@ pub fn parse(url: String, dom: &tl::VDom) -> Result<PrecioPoint, anyhow::Error>
.find_map(|n| n.as_tag()) .find_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser())) .map(|t| t.inner_text(dom.parser()))
// https://github.com/catdevnull/preciazo/issues/24 // https://github.com/catdevnull/preciazo/issues/24
.map(|s| html_escape::decode_html_entities(s.trim()).to_string()); .map(|s| {
html_escape::decode_html_entities(s.trim())
.trim()
.to_string()
});
let image_url = dom let image_url = dom
.query_selector(".zoomImage1") .query_selector(".zoomImage1")

View file

@ -1,2 +1,2 @@
DB_PATH=../sqlite.db DB_PATH=../db.db
VITE_API_HOST=http://localhost:8000 VITE_API_HOST=http://localhost:8000