mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 21:16:19 +00:00
Compare commits
No commits in common. "5dcc901a80fb765995bc795f3b009616dd404768" and "198e51fc97e799510ca725dfc597c9922af07e46" have entirely different histories.
5dcc901a80
...
198e51fc97
45 changed files with 201 additions and 1159 deletions
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,44 +0,0 @@
|
|||
import pMap from "p-map";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
||||
await scrapBySitemap();
|
||||
|
||||
export async function scrapCarrefourProducts() {
|
||||
await scrapBySitemap();
|
||||
}
|
||||
|
||||
async function scrapBySitemap() {
|
||||
// de https://www.carrefour.com.ar/sitemap.xml
|
||||
const sitemaps = [
|
||||
"https://www.carrefour.com.ar/sitemap/product-0.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-1.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-2.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-3.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-4.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-5.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-6.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-7.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-8.xml",
|
||||
"https://www.carrefour.com.ar/sitemap/product-9.xml",
|
||||
];
|
||||
|
||||
await pMap(
|
||||
sitemaps,
|
||||
async (sitemapUrl) => {
|
||||
const res = await fetch(sitemapUrl);
|
||||
const xml = await res.text();
|
||||
let urls = new Set<string>();
|
||||
new HTMLRewriter()
|
||||
.on("loc", {
|
||||
text(element) {
|
||||
const txt = element.text.trim();
|
||||
if (!txt) return;
|
||||
urls.add(txt);
|
||||
},
|
||||
})
|
||||
.transform(new Response(xml));
|
||||
saveUrls(Array.from(urls));
|
||||
},
|
||||
{ concurrency: 3 }
|
||||
);
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
{
|
||||
"name": "carrefour-link-scraper",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"linkedom": "^0.16.5",
|
||||
"p-map": "^7.0.1"
|
||||
}
|
||||
}
|
|
@ -1,24 +1,23 @@
|
|||
import { getHtml } from "../scraper/fetch.js";
|
||||
import { parseHTML } from "linkedom";
|
||||
import PQueue from "p-queue";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
||||
export async function scrapCotoProducts() {
|
||||
// let fetched = new Set<string>();
|
||||
{
|
||||
const initial =
|
||||
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
|
||||
|
||||
const queue = new PQueue({ concurrency: 4 });
|
||||
const queue = new PQueue({ concurrency: 2 });
|
||||
|
||||
const pageSize = 300; // hasta 1000
|
||||
const links = Array.from(
|
||||
{ length: Math.ceil(29000 / pageSize) },
|
||||
(x, i) => i
|
||||
).map((i) => {
|
||||
const url = new URL(initial);
|
||||
url.searchParams.set("No", `${i * pageSize}`);
|
||||
url.searchParams.set("Nrpp", `${pageSize}`);
|
||||
return url.toString();
|
||||
});
|
||||
const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
|
||||
(i) => {
|
||||
const url = new URL(initial);
|
||||
url.searchParams.set("No", `${i * pageSize}`);
|
||||
url.searchParams.set("Nrpp", `${pageSize}`);
|
||||
return url.toString();
|
||||
}
|
||||
);
|
||||
|
||||
const promises = links.map((l) => queue.add(getPage(l)));
|
||||
await Promise.all(promises);
|
||||
|
@ -39,6 +38,22 @@ function getPage(url: string) {
|
|||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
||||
(a) => new URL(a.href, url).toString()
|
||||
);
|
||||
saveUrls(hrefs);
|
||||
hrefs.forEach((h) => process.stdout.write(h + "\n"));
|
||||
|
||||
// const nextLinks = Array.from(
|
||||
// document.querySelectorAll<HTMLAnchorElement>(
|
||||
// "#atg_store_pagination a[href]"
|
||||
// ),
|
||||
// (a) => new URL(a.href, url).toString()
|
||||
// );
|
||||
|
||||
// await Promise.all(
|
||||
// nextLinks
|
||||
// .filter((l) => !fetched.has(l))
|
||||
// .map((l) => {
|
||||
// fetched.add(l);
|
||||
// return queue.add(getPage(l));
|
||||
// })
|
||||
// );
|
||||
};
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"linkedom": "^0.16.5",
|
||||
"p-queue": "^8.0.1"
|
||||
"p-queue": "^8.0.1",
|
||||
"tsx": "^4.7.0",
|
||||
"undici": "^6.2.0"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
import { Database } from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import { DB_PATH } from "./drizzle.config.js";
|
||||
import { migrateDb } from "./migrate.js";
|
||||
import * as schema from "./schema.js";
|
||||
|
||||
migrateDb();
|
||||
|
||||
export const sqlite = new Database(DB_PATH);
|
||||
export const db = drizzle(sqlite, { schema });
|
|
@ -1,3 +0,0 @@
|
|||
-- Custom SQL migration file, put you code below! --
|
||||
create virtual table precios_fts using fts5(ean, url, name, content=precios, content_rowid=id);
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
-- Custom SQL migration file, put you code below! --
|
||||
insert into precios_fts(rowid,ean,url,name) select id,ean,url,name from precios;
|
|
@ -1,7 +0,0 @@
|
|||
-- Custom SQL migration file, put you code below! --
|
||||
|
||||
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
|
||||
-- Triggers to keep the FTS index up to date.
|
||||
CREATE TRIGGER precios_fts_ai AFTER INSERT ON precios BEGIN
|
||||
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
|
||||
END;
|
|
@ -1,6 +0,0 @@
|
|||
-- Custom SQL migration file, put you code below! --
|
||||
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
|
||||
-- Triggers to keep the FTS index up to date.
|
||||
CREATE TRIGGER precios_fts_ad AFTER DELETE ON precios BEGIN
|
||||
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
|
||||
END;
|
|
@ -1,8 +0,0 @@
|
|||
-- Custom SQL migration file, put you code below! --
|
||||
|
||||
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
|
||||
-- Triggers to keep the FTS index up to date.
|
||||
CREATE TRIGGER precios_fts_au AFTER UPDATE ON precios BEGIN
|
||||
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
|
||||
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
|
||||
END;
|
|
@ -1,8 +0,0 @@
|
|||
CREATE TABLE `producto_urls` (
|
||||
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
|
||||
`url` text NOT NULL,
|
||||
`first_seen` integer NOT NULL,
|
||||
`last_seen` integer NOT NULL
|
||||
);
|
||||
--> statement-breakpoint
|
||||
CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`);
|
|
@ -1,101 +0,0 @@
|
|||
{
|
||||
"id": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
|
||||
"prevId": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
{
|
||||
"id": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
|
||||
"prevId": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
{
|
||||
"id": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
|
||||
"prevId": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
{
|
||||
"id": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
|
||||
"prevId": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
{
|
||||
"id": "082630a9-3744-4e33-bde5-06045ca57d36",
|
||||
"prevId": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
|
@ -1,146 +0,0 @@
|
|||
{
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
|
||||
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"precios_ean_idx": {
|
||||
"name": "precios_ean_idx",
|
||||
"columns": [
|
||||
"ean"
|
||||
],
|
||||
"isUnique": false
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
},
|
||||
"producto_urls": {
|
||||
"name": "producto_urls",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"first_seen": {
|
||||
"name": "first_seen",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"last_seen": {
|
||||
"name": "last_seen",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"producto_urls_url_unique": {
|
||||
"name": "producto_urls_url_unique",
|
||||
"columns": [
|
||||
"url"
|
||||
],
|
||||
"isUnique": true
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"schemas": {},
|
||||
"tables": {},
|
||||
"columns": {}
|
||||
}
|
||||
}
|
|
@ -29,48 +29,6 @@
|
|||
"when": 1703521964385,
|
||||
"tag": "0003_abandoned_landau",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 4,
|
||||
"version": "5",
|
||||
"when": 1703726748364,
|
||||
"tag": "0004_left_wolfsbane",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 5,
|
||||
"version": "5",
|
||||
"when": 1703807455551,
|
||||
"tag": "0005_lucky_epoch",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 6,
|
||||
"version": "5",
|
||||
"when": 1703807457204,
|
||||
"tag": "0006_jazzy_madripoor",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 7,
|
||||
"version": "5",
|
||||
"when": 1703807458666,
|
||||
"tag": "0007_bright_silvermane",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 8,
|
||||
"version": "5",
|
||||
"when": 1703807460152,
|
||||
"tag": "0008_funny_nighthawk",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 9,
|
||||
"version": "5",
|
||||
"when": 1703895109501,
|
||||
"tag": "0009_breezy_forge",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,16 +1,15 @@
|
|||
import Database from "bun:sqlite";
|
||||
import { join, dirname } from "node:path";
|
||||
import { join } from "node:path";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||
import * as schema from "./schema.js";
|
||||
import { DB_PATH } from "./drizzle.config.js";
|
||||
|
||||
const url = new URL(import.meta.url);
|
||||
export function migrateDb() {
|
||||
const sqlite = new Database(DB_PATH);
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
||||
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
|
||||
migrate(db, { migrationsFolder: join(import.meta.dir, "drizzle") });
|
||||
sqlite.run(`
|
||||
pragma journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"drizzle-orm": "=0.29.1"
|
||||
"drizzle-orm": "^0.29.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "^1.0.0",
|
||||
|
|
|
@ -22,12 +22,3 @@ export const precios = sqliteTable(
|
|||
);
|
||||
|
||||
export type Precio = typeof precios.$inferSelect;
|
||||
|
||||
export const productoUrls = sqliteTable("producto_urls", {
|
||||
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
||||
url: text("url").unique().notNull(),
|
||||
firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(),
|
||||
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
|
||||
});
|
||||
|
||||
export type ProductUrl = typeof productoUrls.$inferSelect;
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
import { sql } from "drizzle-orm";
|
||||
import { db } from "./db.js";
|
||||
import { productoUrls } from "./schema.js";
|
||||
|
||||
export function saveUrls(urls: string[]) {
|
||||
db.transaction((tx) => {
|
||||
const now = new Date();
|
||||
const insertUrlTra = tx
|
||||
.insert(productoUrls)
|
||||
.values({
|
||||
url: sql.placeholder("url"),
|
||||
firstSeen: now,
|
||||
lastSeen: now,
|
||||
})
|
||||
.onConflictDoUpdate({
|
||||
target: productoUrls.url,
|
||||
set: { lastSeen: now },
|
||||
})
|
||||
.prepare();
|
||||
|
||||
for (const href of urls) {
|
||||
insertUrlTra.run({ url: href });
|
||||
}
|
||||
});
|
||||
}
|
|
@ -1,110 +1,94 @@
|
|||
import pMap from "p-map";
|
||||
import { parseHTML } from "linkedom";
|
||||
import { getHtml } from "../scraper/fetch.js";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
||||
const categorias = [
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/harinas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/picadas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/leches",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/hielo",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
|
||||
"https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
];
|
||||
|
||||
export async function scrapDiaProducts() {
|
||||
await Promise.all([scrapBySite(), scrapBySitemap()]);
|
||||
}
|
||||
|
||||
async function scrapBySitemap() {
|
||||
// de https://diaonline.supermercadosdia.com.ar/sitemap.xml
|
||||
const sitemaps = [
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
|
||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||
(async () => {
|
||||
const categorias = [
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/harinas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/picadas",
|
||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
|
||||
"https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/leches",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
|
||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
|
||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
|
||||
"https://diaonline.supermercadosdia.com.ar/congelados/hielo",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
|
||||
"https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
|
||||
"https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||
];
|
||||
|
||||
await pMap(sitemaps, async (sitemapUrl) => {
|
||||
const res = await fetch(sitemapUrl);
|
||||
const xml = await res.text();
|
||||
let urls = new Set<string>();
|
||||
new HTMLRewriter()
|
||||
.on("loc", {
|
||||
text(element) {
|
||||
const txt = element.text.trim();
|
||||
if (!txt) return;
|
||||
urls.add(txt);
|
||||
},
|
||||
const links = categorias.flatMap(
|
||||
(link) =>
|
||||
Array.from({ length: 51 }, (x, i) => i).map((i) => {
|
||||
const url = new URL(link);
|
||||
url.searchParams.set("page", `${i}`);
|
||||
return url.toString();
|
||||
})
|
||||
.transform(new Response(xml));
|
||||
saveUrls(Array.from(urls));
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapBySite() {
|
||||
const links = categorias.flatMap((link) =>
|
||||
Array.from({ length: 51 }, (x, i) => i).map((i) => {
|
||||
const url = new URL(link);
|
||||
url.searchParams.set("page", `${i}`);
|
||||
return url.toString();
|
||||
})
|
||||
// el order solo carga con el frontend :(
|
||||
// .flatMap((link) =>
|
||||
// [
|
||||
// "OrderByNameASC",
|
||||
// "OrderByNameDESC",
|
||||
// "OrderByTopSaleDESC",
|
||||
// "OrderByPriceDESC",
|
||||
// "OrderByPriceASC",
|
||||
// "",
|
||||
// ].map((order) => {
|
||||
// const url = new URL(link);
|
||||
// url.searchParams.set("order", order);
|
||||
// return url.toString();
|
||||
// })
|
||||
// )
|
||||
);
|
||||
|
||||
await pMap(
|
||||
|
@ -119,8 +103,8 @@ async function scrapBySite() {
|
|||
),
|
||||
(a) => new URL(a.href, url).toString()
|
||||
);
|
||||
saveUrls(hrefs);
|
||||
hrefs.forEach((h) => process.stdout.write(h + "\n"));
|
||||
},
|
||||
{ concurrency: 32 }
|
||||
);
|
||||
}
|
||||
})();
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"linkedom": "^0.16.5",
|
||||
"p-map": "^7.0.0"
|
||||
"p-map": "^7.0.0",
|
||||
"tsx": "^4.7.0",
|
||||
"undici": "^6.2.0"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
"workspaces": [
|
||||
"dia-link-scraper",
|
||||
"coto-link-scraper",
|
||||
"carrefour-link-scraper",
|
||||
"scraper",
|
||||
"sitio",
|
||||
"db-datos"
|
||||
|
|
|
@ -1,20 +1,14 @@
|
|||
import { mkdtemp, access, writeFile } from "node:fs/promises";
|
||||
import { mkdtemp, access } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join, resolve } from "node:path";
|
||||
import { spawn } from "node:child_process";
|
||||
import { Supermercado, hosts } from "db-datos/supermercado.js";
|
||||
import { Supermercado } from "db-datos/supermercado.js";
|
||||
import PQueue from "p-queue";
|
||||
import { format, formatDuration, intervalToDuration } from "date-fns";
|
||||
import { parseWarc } from "./scrap.js";
|
||||
import { S3Client } from "@aws-sdk/client-s3";
|
||||
import { Upload } from "@aws-sdk/lib-storage";
|
||||
import { BunFile } from "bun";
|
||||
import { db } from "db-datos/db.js";
|
||||
import { like } from "drizzle-orm";
|
||||
import { productoUrls } from "db-datos/schema.js";
|
||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
||||
|
||||
const supermercados: Supermercado[] = [
|
||||
Supermercado.Carrefour,
|
||||
|
@ -77,40 +71,11 @@ class Auto {
|
|||
}
|
||||
|
||||
async downloadList(supermercado: Supermercado) {
|
||||
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
||||
|
||||
let listPath: string;
|
||||
{
|
||||
const t0 = performance.now();
|
||||
switch (supermercado) {
|
||||
case "Dia":
|
||||
await scrapDiaProducts();
|
||||
break;
|
||||
case "Coto":
|
||||
await scrapCotoProducts();
|
||||
break;
|
||||
case "Carrefour":
|
||||
await scrapCarrefourProducts();
|
||||
break;
|
||||
}
|
||||
this.inform(
|
||||
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
||||
);
|
||||
}
|
||||
|
||||
listPath = join(ctxPath, `lista-${supermercado}.txt`);
|
||||
const host = Object.entries(hosts).find(
|
||||
([host, supe]) => supe === supermercado
|
||||
)![0];
|
||||
const results = await db.query.productoUrls
|
||||
.findMany({
|
||||
where: like(productoUrls.url, `%${host}%`),
|
||||
})
|
||||
.execute();
|
||||
const urls = results.map((r) => r.url);
|
||||
await writeFile(listPath, urls.join("\n") + "\n");
|
||||
|
||||
const listPath = resolve(
|
||||
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
|
||||
);
|
||||
const date = new Date();
|
||||
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
||||
const zstdWarcName = `${supermercado}-${format(
|
||||
date,
|
||||
"yyyy-MM-dd-HH:mm"
|
||||
|
@ -133,7 +98,7 @@ class Auto {
|
|||
const t0 = performance.now();
|
||||
await subproc.exited;
|
||||
this.inform(
|
||||
`[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
|
||||
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
|
||||
);
|
||||
|
||||
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
|
||||
|
@ -222,6 +187,7 @@ class Auto {
|
|||
stdio: ["pipe", null, null],
|
||||
}
|
||||
);
|
||||
// @ts-expect-error a los types de bun no le gusta????
|
||||
decompressor.stdout.pipe(compressor.stdin);
|
||||
compressor.on("close", (code) => {
|
||||
if (code !== 0) {
|
||||
|
|
|
@ -1,6 +1,32 @@
|
|||
import { request } from "undici";
|
||||
import { createBrotliDecompress, createUnzip } from "node:zlib";
|
||||
import { pipeline } from "node:stream/promises";
|
||||
|
||||
export async function getHtml(url: string) {
|
||||
const res = await fetch(url);
|
||||
return readableToBuffer(res.body!);
|
||||
const res = await request(url, {
|
||||
headers: {
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
},
|
||||
throwOnError: true,
|
||||
bodyTimeout: 10 * 60 * 1000,
|
||||
});
|
||||
let output: Buffer;
|
||||
switch (res.headers["content-encoding"]) {
|
||||
case "gzip":
|
||||
case "deflate":
|
||||
output = await pipeline(res.body, createUnzip(), readableToBuffer);
|
||||
break;
|
||||
case "br":
|
||||
output = await pipeline(
|
||||
res.body,
|
||||
createBrotliDecompress(),
|
||||
readableToBuffer
|
||||
);
|
||||
break;
|
||||
default:
|
||||
output = await readableToBuffer(res.body);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
async function readableToBuffer(source: AsyncIterable<any>) {
|
||||
|
|
|
@ -5,8 +5,7 @@
|
|||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
|
||||
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
|
||||
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile .."
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
@ -16,10 +15,11 @@
|
|||
"@aws-sdk/lib-storage": "^3.478.0",
|
||||
"date-fns": "^3.0.6",
|
||||
"db-datos": "workspace:^",
|
||||
"drizzle-orm": "=0.29.1",
|
||||
"drizzle-orm": "^0.29.1",
|
||||
"linkedom": "^0.16.5",
|
||||
"nanoid": "^5.0.4",
|
||||
"p-queue": "^8.0.1",
|
||||
"undici": "^6.2.0",
|
||||
"warcio": "^2.2.1",
|
||||
"zod": "^3.22.4"
|
||||
},
|
||||
|
|
|
@ -34,11 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
|
|||
const ean = getEanFromText(dom);
|
||||
const precioCentavos = getPriceFromText(dom);
|
||||
|
||||
const name = dom.document
|
||||
.querySelector("h1.product_page")
|
||||
?.textContent?.trim();
|
||||
const imageUrl =
|
||||
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
|
||||
const name = dom.document.querySelector("h1.product_page")?.textContent;
|
||||
const imageUrl = dom.document.querySelector<HTMLImageElement>(
|
||||
".productImageZoom img"
|
||||
)?.src;
|
||||
|
||||
return { name, imageUrl, ean, precioCentavos };
|
||||
}
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import { Database } from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { WARCParser } from "warcio";
|
||||
import { writeFile } from "fs/promises";
|
||||
|
@ -7,10 +9,16 @@ import { getDiaProduct } from "./parsers/dia.js";
|
|||
import { getCotoProduct } from "./parsers/coto.js";
|
||||
import { join } from "path";
|
||||
import { and, eq, sql } from "drizzle-orm";
|
||||
import { db } from "db-datos/db.js";
|
||||
import { DB_PATH } from "db-datos/drizzle.config.js";
|
||||
import { migrateDb } from "db-datos/migrate.js";
|
||||
|
||||
const DEBUG = false;
|
||||
const PARSER_VERSION = 4;
|
||||
const PARSER_VERSION = 2;
|
||||
|
||||
migrateDb();
|
||||
|
||||
const sqlite = new Database(DB_PATH);
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
||||
const getPrevPrecio = db
|
||||
.select({ id: schema.precios.id })
|
||||
|
|
|
@ -1,24 +1,7 @@
|
|||
FROM docker.io/oven/bun:1-alpine as build
|
||||
RUN apk add --no-cache nodejs
|
||||
WORKDIR /usr/src/app
|
||||
COPY . .
|
||||
WORKDIR /usr/src/app/sitio
|
||||
RUN bun install && \
|
||||
bun run build
|
||||
|
||||
# FROM docker.io/oven/bun:1-alpine as deps
|
||||
# WORKDIR /usr/src/app/sitio
|
||||
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
|
||||
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
||||
|
||||
FROM docker.io/alpine:3.19
|
||||
RUN apk add --no-cache tini nodejs npm jq
|
||||
|
||||
WORKDIR /app
|
||||
COPY --from=build /usr/src/app/sitio/package.json package.real.json
|
||||
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
|
||||
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
||||
COPY --from=build /usr/src/app/sitio/build .
|
||||
FROM docker.io/oven/bun:1-alpine
|
||||
COPY build/ .
|
||||
RUN bun i
|
||||
EXPOSE 3000
|
||||
|
||||
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
|
||||
ENV PROTOCOL_HEADER=x-forwarded-proto
|
||||
|
@ -26,6 +9,5 @@ ENV HOST_HEADER=x-forwarded-host
|
|||
|
||||
VOLUME /db
|
||||
ENV DB_PATH=/db/db.db
|
||||
EXPOSE 3000
|
||||
|
||||
CMD ["tini", "node", "."]
|
||||
CMD ["bun", "run", "start"]
|
|
@ -5,7 +5,7 @@
|
|||
"scripts": {
|
||||
"dev": "vite dev",
|
||||
"build": "vite build",
|
||||
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Containerfile ..",
|
||||
"build:container": "bun --bun vite build && podman build -t gitea.nulo.in/nulo/preciazo/sitio .",
|
||||
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
|
||||
"preview": "vite preview",
|
||||
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
|
||||
|
@ -16,6 +16,7 @@
|
|||
"devDependencies": {
|
||||
"@sveltejs/kit": "^2.0.0",
|
||||
"@sveltejs/vite-plugin-svelte": "^3.0.0",
|
||||
"@types/bun": "^1.0.0",
|
||||
"autoprefixer": "^10.4.16",
|
||||
"db-datos": "workspace:^",
|
||||
"postcss": "^8.4.32",
|
||||
|
@ -24,21 +25,18 @@
|
|||
"prettier-plugin-svelte": "^3.1.2",
|
||||
"prettier-plugin-tailwindcss": "^0.5.9",
|
||||
"svelte": "^4.2.7",
|
||||
"svelte-adapter-bun": "^0.5.1",
|
||||
"svelte-check": "^3.6.0",
|
||||
"tailwindcss": "^3.3.6",
|
||||
"tslib": "^2.4.1",
|
||||
"typescript": "^5.0.0",
|
||||
"vite": "^5.0.3",
|
||||
"@sveltejs/adapter-node": "^2.0.2",
|
||||
"@types/better-sqlite3": "^7.6.8",
|
||||
"@types/node": "^20.10.6"
|
||||
"vite": "^5.0.3"
|
||||
},
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^9.2.2",
|
||||
"chart.js": "^4.4.1",
|
||||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||
"dayjs": "^1.11.10",
|
||||
"drizzle-orm": "=0.29.1"
|
||||
"drizzle-orm": "^0.29.1"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,10 +6,7 @@
|
|||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
%sveltekit.head%
|
||||
</head>
|
||||
<body
|
||||
class="bg-neutral-100 dark:bg-neutral-900 dark:text-neutral-200"
|
||||
data-sveltekit-preload-data="hover"
|
||||
>
|
||||
<body data-sveltekit-preload-data="hover">
|
||||
<div style="display: contents">%sveltekit.body%</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -1,8 +0,0 @@
|
|||
<!-- Product preview card: a link to the product's /ean/<ean> page showing its image and name. -->
<script lang="ts">
|
||||
// The product to render; all three fields are required by the declared type.
export let product: { ean: string; name: string; imageUrl: string };
|
||||
</script>
|
||||
|
||||
<!-- The whole card is a single link; the product name doubles as the image alt text. -->
<a href={`/ean/${product.ean}`} class="flex">
|
||||
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
|
||||
<p class="text-xl">{product.name}</p>
|
||||
</a>
|
|
@ -1,10 +1,9 @@
|
|||
import Database from "better-sqlite3";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import Database from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { env } from "$env/dynamic/private";
|
||||
|
||||
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
||||
export { db };
|
||||
export const db = drizzle(sqlite, { schema });
|
||||
export * as schema from "db-datos/schema.js";
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
import { countDistinct } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
/**
 * Server-side load function: counts the distinct `ean` values stored in the
 * `precios` table and exposes that number to the page as `nProductos`.
 *
 * @returns an object of the form `{ nProductos }`.
 */
export const load: PageServerLoad = async () => {
|
||||
const nProductosR = await db
|
||||
.select({
|
||||
count: countDistinct(precios.ean),
|
||||
})
|
||||
.from(precios);
|
||||
// Only the first row of the result is read.
const nProductos = nProductosR[0].count;
|
||||
return { nProductos };
|
||||
};
|
|
@ -1,43 +1,5 @@
|
|||
<script lang="ts">
|
||||
<script>
|
||||
import "../app.pcss";
|
||||
|
||||
import type { PageData } from "./$types";
|
||||
|
||||
export let data: PageData;
|
||||
</script>
|
||||
|
||||
<!-- https://flowbite.com/docs/forms/search-input/ -->
|
||||
<form method="GET" action="/search">
|
||||
<div class="flex items-stretch p-4">
|
||||
<input
|
||||
type="search"
|
||||
name="q"
|
||||
class="block w-full rounded-l-lg border border-gray-300 bg-gray-50 p-2.5 text-sm text-gray-900 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-600 dark:bg-gray-700 dark:text-white dark:placeholder-gray-400 dark:focus:border-blue-500"
|
||||
placeholder={`Buscar entre ${data.nProductos} productos`}
|
||||
required
|
||||
/>
|
||||
<button
|
||||
type="submit"
|
||||
class="block rounded-e-lg border border-blue-700 bg-blue-700 p-2.5 text-sm font-medium text-white hover:bg-blue-800 focus:outline-none focus:ring-4 focus:ring-blue-300 dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800"
|
||||
>
|
||||
<svg
|
||||
class="h-4 w-4"
|
||||
aria-hidden="true"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
fill="none"
|
||||
viewBox="0 0 20 20"
|
||||
>
|
||||
<path
|
||||
stroke="currentColor"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
stroke-width="2"
|
||||
d="m19 19-4-4m0-7A7 7 0 1 1 1 8a7 7 0 0 1 14 0Z"
|
||||
/>
|
||||
</svg>
|
||||
<span class="sr-only">Search</span>
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<slot />
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
@ -5,11 +6,7 @@ import { sql } from "drizzle-orm";
|
|||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const q = db
|
||||
.select({
|
||||
ean: precios.ean,
|
||||
name: precios.name,
|
||||
imageUrl: precios.imageUrl,
|
||||
})
|
||||
.select({ ean: precios.ean, name: precios.name })
|
||||
.from(precios)
|
||||
.groupBy(precios.ean)
|
||||
.having(sql`max(length(name))`)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
<script lang="ts">
|
||||
import ProductPreview from "$lib/ProductPreview.svelte";
|
||||
import type { PageData } from "./$types";
|
||||
|
||||
export let data: PageData;
|
||||
|
@ -31,10 +30,12 @@
|
|||
|
||||
<section>
|
||||
<h2 class="text-lg font-bold">Random</h2>
|
||||
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
||||
<ul>
|
||||
{#each data.precios as product}
|
||||
<li>
|
||||
<ProductPreview {product} />
|
||||
<a href={`/ean/${product.ean}`}>
|
||||
{product.name}
|
||||
</a>
|
||||
</li>
|
||||
{/each}
|
||||
</ul>
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
{#if data.meta}
|
||||
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
|
||||
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
|
||||
<img src={data.meta.imageUrl} class="max-h-48" />
|
||||
<div class="flex gap-2">
|
||||
{#each urls as [supermercado, url]}
|
||||
<a
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
}
|
||||
</script>
|
||||
|
||||
<div class="h-[300px] w-full min-w-[500px] bg-neutral-200 dark:invert">
|
||||
<div class="h-[300px] w-full min-w-[500px]">
|
||||
<ChartJs
|
||||
type="line"
|
||||
data={{ datasets }}
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import { eq, max, sql } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
/**
 * Server-side load function for the search page.
 *
 * Reads the `q` query-string parameter. When it is non-empty, runs a MATCH
 * query against `precios_fts`, joined to `precios` on `ean`, and returns the
 * matching rows (`ean`, `name`, `imageUrl`). When `q` is missing or empty no
 * query is run and `results` stays `null`.
 *
 * NOTE(review): `query` is interpolated through the `sql` template tag, which
 * presumably binds it as a parameter rather than splicing it into the SQL
 * text — confirm. The raw user text is still used as the MATCH expression.
 *
 * @returns an object of the form `{ query, results }`.
 */
export const load: PageServerLoad = async ({ url }) => {
|
||||
const query = url.searchParams.get("q");
|
||||
let results: null | { ean: string; name: string; imageUrl: string }[] = null;
|
||||
// An empty string is falsy, so `?q=` behaves the same as no parameter at all.
if (query) {
|
||||
// The result of db.all is assigned directly, without `await`.
results = db.all(
|
||||
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||
join precios p on p.ean = f.ean
|
||||
where f.name match ${query};`,
|
||||
);
|
||||
}
|
||||
|
||||
return { query, results };
|
||||
};
|
|
@ -1,21 +0,0 @@
|
|||
<!-- Search results page: renders data.results as a grid of ProductPreview cards, or a hint when there are none. -->
<script lang="ts">
|
||||
import ProductPreview from "$lib/ProductPreview.svelte";
|
||||
import type { PageData } from "./$types";
|
||||
|
||||
// Page data; this template reads `data.query` and `data.results`.
export let data: PageData;
|
||||
</script>
|
||||
|
||||
<!-- A falsy data.results falls through to the hint in the else branch below. -->
{#if data.results}
|
||||
<header class="my-2">
|
||||
<h1 class="text-2xl font-bold">Resultados para "{data.query}"</h1>
|
||||
</header>
|
||||
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
|
||||
{#each data.results as product}
|
||||
<li>
|
||||
<ProductPreview {product} />
|
||||
</li>
|
||||
|
||||
{/each}
|
||||
</ul>
|
||||
{:else}
|
||||
Probá buscando algo.
|
||||
{/if}
|
|
@ -1,5 +1,5 @@
|
|||
import adapter from "@sveltejs/adapter-node";
|
||||
// import adapter from "svelte-adapter-bun";
|
||||
// import adapter from "@sveltejs/adapter-node";
|
||||
import adapter from "svelte-adapter-bun";
|
||||
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
|
||||
|
||||
/** @type {import('@sveltejs/kit').Config} */
|
||||
|
|
Loading…
Reference in a new issue