Compare commits

..

No commits in common. "5dcc901a80fb765995bc795f3b009616dd404768" and "198e51fc97e799510ca725dfc597c9922af07e46" have entirely different histories.

45 changed files with 201 additions and 1159 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -1,44 +0,0 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
// NOTE(review): this top-level call starts the sitemap scrape as soon as the
// module is loaded, in addition to any later scrapCarrefourProducts() call —
// confirm that importers expect a scrape to run on import.
await scrapBySitemap();
/**
 * Public entry point for collecting Carrefour product URLs.
 * Currently a thin wrapper that awaits scrapBySitemap().
 */
export async function scrapCarrefourProducts() {
await scrapBySitemap();
}
/**
 * Collects product URLs from Carrefour's product sitemaps and stores them
 * with `saveUrls`.
 *
 * Each sitemap is downloaded, the text of every `<loc>` element is extracted,
 * and the de-duplicated URLs of that sitemap are saved in a single `saveUrls`
 * call. At most three sitemaps are processed at the same time.
 *
 * A sitemap that answers with a non-2xx status is reported on stderr and
 * skipped, so an error page is never parsed as if it were a sitemap and the
 * remaining sitemaps are still processed.
 */
async function scrapBySitemap() {
  // Listed in https://www.carrefour.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        console.error(
          `[carrefour] ${sitemapUrl} respondió ${res.status}, se omite`
        );
        return;
      }
      const xml = await res.text();
      const urls = new Set<string>();
      // HTMLRewriter hands text to the handler in chunks, and one text node may
      // arrive as several chunks. Accumulate them and only record the URL once
      // the node is complete, otherwise a split URL would be stored as fragments.
      let pending = "";
      const transformed = new HTMLRewriter()
        .on("loc", {
          text(chunk) {
            pending += chunk.text;
            if (!chunk.lastInTextNode) return;
            const txt = pending.trim();
            pending = "";
            if (txt) urls.add(txt);
          },
        })
        .transform(new Response(xml));
      // Drain the rewritten body so every handler has run before `urls` is read;
      // this is a no-op if the runtime already transformed the body eagerly.
      await transformed.text();
      saveUrls(Array.from(urls));
    },
    { concurrency: 3 }
  );
}

View file

@ -1,17 +0,0 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}

View file

@ -1,24 +1,23 @@
import { getHtml } from "../scraper/fetch.js"; import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
export async function scrapCotoProducts() { // let fetched = new Set<string>();
{
const initial = const initial =
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200"; "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
const queue = new PQueue({ concurrency: 4 }); const queue = new PQueue({ concurrency: 2 });
const pageSize = 300; // hasta 1000 const pageSize = 300; // hasta 1000
const links = Array.from( const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
{ length: Math.ceil(29000 / pageSize) }, (i) => {
(x, i) => i
).map((i) => {
const url = new URL(initial); const url = new URL(initial);
url.searchParams.set("No", `${i * pageSize}`); url.searchParams.set("No", `${i * pageSize}`);
url.searchParams.set("Nrpp", `${pageSize}`); url.searchParams.set("Nrpp", `${pageSize}`);
return url.toString(); return url.toString();
}); }
);
const promises = links.map((l) => queue.add(getPage(l))); const promises = links.map((l) => queue.add(getPage(l)));
await Promise.all(promises); await Promise.all(promises);
@ -39,6 +38,22 @@ function getPage(url: string) {
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"), document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
(a) => new URL(a.href, url).toString() (a) => new URL(a.href, url).toString()
); );
saveUrls(hrefs); hrefs.forEach((h) => process.stdout.write(h + "\n"));
// const nextLinks = Array.from(
// document.querySelectorAll<HTMLAnchorElement>(
// "#atg_store_pagination a[href]"
// ),
// (a) => new URL(a.href, url).toString()
// );
// await Promise.all(
// nextLinks
// .filter((l) => !fetched.has(l))
// .map((l) => {
// fetched.add(l);
// return queue.add(getPage(l));
// })
// );
}; };
} }

View file

@ -12,6 +12,8 @@
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"p-queue": "^8.0.1" "p-queue": "^8.0.1",
"tsx": "^4.7.0",
"undici": "^6.2.0"
} }
} }

View file

@ -1,10 +0,0 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { DB_PATH } from "./drizzle.config.js";
import { migrateDb } from "./migrate.js";
import * as schema from "./schema.js";
// Run migrateDb() at import time, before the shared connection below is opened.
migrateDb();
// Shared SQLite connection, opened on the file at DB_PATH.
export const sqlite = new Database(DB_PATH);
// Drizzle wrapper around that connection, bound to the tables exported by ./schema.js.
export const db = drizzle(sqlite, { schema });

View file

@ -1,3 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- Full-text index over the ean, url and name columns of precios.
-- content=precios / content_rowid=id make this an FTS5 external-content table:
-- the index refers back to rows of precios through their id instead of keeping
-- its own copy of the column values, so it has to be kept in sync explicitly.
create virtual table precios_fts using fts5(ean, url, name, content=precios, content_rowid=id);

View file

@ -1,2 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- Backfill: add every row that already exists in precios to the FTS index,
-- using the row's id as the FTS rowid.
insert into precios_fts(rowid,ean,url,name) select id,ean,url,name from precios;

View file

@ -1,7 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-insert trigger: index each new precios row under its id.
CREATE TRIGGER precios_fts_ai AFTER INSERT ON precios BEGIN
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
END;

View file

@ -1,6 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-delete trigger: inserting a row whose first column (named after the
-- table) holds 'delete' is the FTS5 command that removes an index entry; it
-- must be given the old rowid and the old column values.
CREATE TRIGGER precios_fts_ad AFTER DELETE ON precios BEGIN
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
END;

View file

@ -1,8 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-update trigger: first remove the index entry built from the old values
-- (FTS5 'delete' command), then index the row again with its new values.
CREATE TRIGGER precios_fts_au AFTER UPDATE ON precios BEGIN
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
END;

View file

@ -1,8 +0,0 @@
-- One row per known product URL, with integer first_seen / last_seen columns.
-- NOTE(review): this file looks generated by drizzle-kit — confirm before hand-editing.
CREATE TABLE `producto_urls` (
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
`url` text NOT NULL,
`first_seen` integer NOT NULL,
`last_seen` integer NOT NULL
);
--> statement-breakpoint
-- Each URL may appear only once in the table.
CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`);

View file

@ -1,101 +0,0 @@
{
"id": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"prevId": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"prevId": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"prevId": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"prevId": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "082630a9-3744-4e33-bde5-06045ca57d36",
"prevId": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,146 +0,0 @@
{
"version": "5",
"dialect": "sqlite",
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -29,48 +29,6 @@
"when": 1703521964385, "when": 1703521964385,
"tag": "0003_abandoned_landau", "tag": "0003_abandoned_landau",
"breakpoints": true "breakpoints": true
},
{
"idx": 4,
"version": "5",
"when": 1703726748364,
"tag": "0004_left_wolfsbane",
"breakpoints": true
},
{
"idx": 5,
"version": "5",
"when": 1703807455551,
"tag": "0005_lucky_epoch",
"breakpoints": true
},
{
"idx": 6,
"version": "5",
"when": 1703807457204,
"tag": "0006_jazzy_madripoor",
"breakpoints": true
},
{
"idx": 7,
"version": "5",
"when": 1703807458666,
"tag": "0007_bright_silvermane",
"breakpoints": true
},
{
"idx": 8,
"version": "5",
"when": 1703807460152,
"tag": "0008_funny_nighthawk",
"breakpoints": true
},
{
"idx": 9,
"version": "5",
"when": 1703895109501,
"tag": "0009_breezy_forge",
"breakpoints": true
} }
] ]
} }

View file

@ -1,16 +1,15 @@
import Database from "bun:sqlite"; import Database from "bun:sqlite";
import { join, dirname } from "node:path"; import { join } from "node:path";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator"; import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
const url = new URL(import.meta.url);
export function migrateDb() { export function migrateDb() {
const sqlite = new Database(DB_PATH); const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") }); migrate(db, { migrationsFolder: join(import.meta.dir, "drizzle") });
sqlite.run(` sqlite.run(`
pragma journal_mode = WAL; pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL; PRAGMA synchronous = NORMAL;

View file

@ -11,7 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"drizzle-orm": "=0.29.1" "drizzle-orm": "^0.29.1"
}, },
"devDependencies": { "devDependencies": {
"@types/bun": "^1.0.0", "@types/bun": "^1.0.0",

View file

@ -22,12 +22,3 @@ export const precios = sqliteTable(
); );
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;
export const productoUrls = sqliteTable("producto_urls", {
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
url: text("url").unique().notNull(),
firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(),
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
});
export type ProductUrl = typeof productoUrls.$inferSelect;

View file

@ -1,25 +0,0 @@
import { sql } from "drizzle-orm";
import { db } from "./db.js";
import { productoUrls } from "./schema.js";
/**
 * Records a batch of product URLs in `producto_urls` within one transaction.
 *
 * A URL that is not stored yet is inserted with `firstSeen` and `lastSeen`
 * set to the current time; a URL that already exists only gets its
 * `lastSeen` refreshed. Every URL of the batch shares the same timestamp.
 *
 * @param urls Product URLs to insert or refresh.
 */
export function saveUrls(urls: string[]) {
  db.transaction((tx) => {
    // Single timestamp for the whole batch.
    const seenAt = new Date();
    // Prepared once and re-run per URL; only the `url` placeholder varies.
    const upsertUrl = tx
      .insert(productoUrls)
      .values({
        url: sql.placeholder("url"),
        firstSeen: seenAt,
        lastSeen: seenAt,
      })
      .onConflictDoUpdate({
        target: productoUrls.url,
        set: { lastSeen: seenAt },
      })
      .prepare();
    urls.forEach((url) => {
      upsertUrl.run({ url });
    });
  });
}

View file

@ -1,9 +1,8 @@
import pMap from "p-map"; import pMap from "p-map";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js"; import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js"; (async () => {
const categorias = [
const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen", "https://diaonline.supermercadosdia.com.ar/almacen",
"https://diaonline.supermercadosdia.com.ar/almacen/conservas", "https://diaonline.supermercadosdia.com.ar/almacen/conservas",
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos", "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
@ -65,46 +64,31 @@ const categorias = [
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC", "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC", "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC", "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
];
export async function scrapDiaProducts() {
await Promise.all([scrapBySite(), scrapBySitemap()]);
}
async function scrapBySitemap() {
// de https://diaonline.supermercadosdia.com.ar/sitemap.xml
const sitemaps = [
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
]; ];
await pMap(sitemaps, async (sitemapUrl) => { const links = categorias.flatMap(
const res = await fetch(sitemapUrl); (link) =>
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
}
async function scrapBySite() {
const links = categorias.flatMap((link) =>
Array.from({ length: 51 }, (x, i) => i).map((i) => { Array.from({ length: 51 }, (x, i) => i).map((i) => {
const url = new URL(link); const url = new URL(link);
url.searchParams.set("page", `${i}`); url.searchParams.set("page", `${i}`);
return url.toString(); return url.toString();
}) })
// el order solo carga con el frontend :(
// .flatMap((link) =>
// [
// "OrderByNameASC",
// "OrderByNameDESC",
// "OrderByTopSaleDESC",
// "OrderByPriceDESC",
// "OrderByPriceASC",
// "",
// ].map((order) => {
// const url = new URL(link);
// url.searchParams.set("order", order);
// return url.toString();
// })
// )
); );
await pMap( await pMap(
@ -119,8 +103,8 @@ async function scrapBySite() {
), ),
(a) => new URL(a.href, url).toString() (a) => new URL(a.href, url).toString()
); );
saveUrls(hrefs); hrefs.forEach((h) => process.stdout.write(h + "\n"));
}, },
{ concurrency: 32 } { concurrency: 32 }
); );
} })();

View file

@ -12,6 +12,8 @@
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"p-map": "^7.0.0" "p-map": "^7.0.0",
"tsx": "^4.7.0",
"undici": "^6.2.0"
} }
} }

View file

@ -4,7 +4,6 @@
"workspaces": [ "workspaces": [
"dia-link-scraper", "dia-link-scraper",
"coto-link-scraper", "coto-link-scraper",
"carrefour-link-scraper",
"scraper", "scraper",
"sitio", "sitio",
"db-datos" "db-datos"

View file

@ -1,20 +1,14 @@
import { mkdtemp, access, writeFile } from "node:fs/promises"; import { mkdtemp, access } from "node:fs/promises";
import { tmpdir } from "node:os"; import { tmpdir } from "node:os";
import { join, resolve } from "node:path"; import { join, resolve } from "node:path";
import { spawn } from "node:child_process"; import { spawn } from "node:child_process";
import { Supermercado, hosts } from "db-datos/supermercado.js"; import { Supermercado } from "db-datos/supermercado.js";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { format, formatDuration, intervalToDuration } from "date-fns"; import { format, formatDuration, intervalToDuration } from "date-fns";
import { parseWarc } from "./scrap.js"; import { parseWarc } from "./scrap.js";
import { S3Client } from "@aws-sdk/client-s3"; import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage"; import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun"; import { BunFile } from "bun";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
const supermercados: Supermercado[] = [ const supermercados: Supermercado[] = [
Supermercado.Carrefour, Supermercado.Carrefour,
@ -77,40 +71,11 @@ class Auto {
} }
async downloadList(supermercado: Supermercado) { async downloadList(supermercado: Supermercado) {
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-")); const listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
let listPath: string;
{
const t0 = performance.now();
switch (supermercado) {
case "Dia":
await scrapDiaProducts();
break;
case "Coto":
await scrapCotoProducts();
break;
case "Carrefour":
await scrapCarrefourProducts();
break;
}
this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
); );
}
listPath = join(ctxPath, `lista-${supermercado}.txt`);
const host = Object.entries(hosts).find(
([host, supe]) => supe === supermercado
)![0];
const results = await db.query.productoUrls
.findMany({
where: like(productoUrls.url, `%${host}%`),
})
.execute();
const urls = results.map((r) => r.url);
await writeFile(listPath, urls.join("\n") + "\n");
const date = new Date(); const date = new Date();
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
const zstdWarcName = `${supermercado}-${format( const zstdWarcName = `${supermercado}-${format(
date, date,
"yyyy-MM-dd-HH:mm" "yyyy-MM-dd-HH:mm"
@ -133,7 +98,7 @@ class Auto {
const t0 = performance.now(); const t0 = performance.now();
await subproc.exited; await subproc.exited;
this.inform( this.inform(
`[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` `wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
); );
const gzippedWarcPath = join(ctxPath, "temp.warc.gz"); const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
@ -222,6 +187,7 @@ class Auto {
stdio: ["pipe", null, null], stdio: ["pipe", null, null],
} }
); );
// @ts-expect-error a los types de bun no le gusta????
decompressor.stdout.pipe(compressor.stdin); decompressor.stdout.pipe(compressor.stdin);
compressor.on("close", (code) => { compressor.on("close", (code) => {
if (code !== 0) { if (code !== 0) {

View file

@ -1,6 +1,32 @@
import { request } from "undici";
import { createBrotliDecompress, createUnzip } from "node:zlib";
import { pipeline } from "node:stream/promises";
export async function getHtml(url: string) { export async function getHtml(url: string) {
const res = await fetch(url); const res = await request(url, {
return readableToBuffer(res.body!); headers: {
"Accept-Encoding": "gzip, deflate, br",
},
throwOnError: true,
bodyTimeout: 10 * 60 * 1000,
});
let output: Buffer;
switch (res.headers["content-encoding"]) {
case "gzip":
case "deflate":
output = await pipeline(res.body, createUnzip(), readableToBuffer);
break;
case "br":
output = await pipeline(
res.body,
createBrotliDecompress(),
readableToBuffer
);
break;
default:
output = await readableToBuffer(res.body);
}
return output;
} }
async function readableToBuffer(source: AsyncIterable<any>) { async function readableToBuffer(source: AsyncIterable<any>) {

View file

@ -5,8 +5,7 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..", "build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile .."
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",
@ -16,10 +15,11 @@
"@aws-sdk/lib-storage": "^3.478.0", "@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6", "date-fns": "^3.0.6",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"drizzle-orm": "=0.29.1", "drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"nanoid": "^5.0.4", "nanoid": "^5.0.4",
"p-queue": "^8.0.1", "p-queue": "^8.0.1",
"undici": "^6.2.0",
"warcio": "^2.2.1", "warcio": "^2.2.1",
"zod": "^3.22.4" "zod": "^3.22.4"
}, },

View file

@ -34,11 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom); const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom); const precioCentavos = getPriceFromText(dom);
const name = dom.document const name = dom.document.querySelector("h1.product_page")?.textContent;
.querySelector("h1.product_page") const imageUrl = dom.document.querySelector<HTMLImageElement>(
?.textContent?.trim(); ".productImageZoom img"
const imageUrl = )?.src;
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
return { name, imageUrl, ean, precioCentavos }; return { name, imageUrl, ean, precioCentavos };
} }

View file

@ -1,3 +1,5 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio"; import { WARCParser } from "warcio";
import { writeFile } from "fs/promises"; import { writeFile } from "fs/promises";
@ -7,10 +9,16 @@ import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js"; import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
import { db } from "db-datos/db.js"; import { DB_PATH } from "db-datos/drizzle.config.js";
import { migrateDb } from "db-datos/migrate.js";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 4; const PARSER_VERSION = 2;
migrateDb();
const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema });
const getPrevPrecio = db const getPrevPrecio = db
.select({ id: schema.precios.id }) .select({ id: schema.precios.id })

View file

@ -1,24 +1,7 @@
FROM docker.io/oven/bun:1-alpine as build FROM docker.io/oven/bun:1-alpine
RUN apk add --no-cache nodejs COPY build/ .
WORKDIR /usr/src/app RUN bun i
COPY . . EXPOSE 3000
WORKDIR /usr/src/app/sitio
RUN bun install && \
bun run build
# FROM docker.io/oven/bun:1-alpine as deps
# WORKDIR /usr/src/app/sitio
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos
FROM docker.io/alpine:3.19
RUN apk add --no-cache tini nodejs npm jq
WORKDIR /app
COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .
# https://github.com/gornostay25/svelte-adapter-bun/issues/39 # https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto ENV PROTOCOL_HEADER=x-forwarded-proto
@ -26,6 +9,5 @@ ENV HOST_HEADER=x-forwarded-host
VOLUME /db VOLUME /db
ENV DB_PATH=/db/db.db ENV DB_PATH=/db/db.db
EXPOSE 3000
CMD ["tini", "node", "."] CMD ["bun", "run", "start"]

View file

@ -5,7 +5,7 @@
"scripts": { "scripts": {
"dev": "vite dev", "dev": "vite dev",
"build": "vite build", "build": "vite build",
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Containerfile ..", "build:container": "bun --bun vite build && podman build -t gitea.nulo.in/nulo/preciazo/sitio .",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio", "push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
"preview": "vite preview", "preview": "vite preview",
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
@ -16,6 +16,7 @@
"devDependencies": { "devDependencies": {
"@sveltejs/kit": "^2.0.0", "@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0", "@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/bun": "^1.0.0",
"autoprefixer": "^10.4.16", "autoprefixer": "^10.4.16",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"postcss": "^8.4.32", "postcss": "^8.4.32",
@ -24,21 +25,18 @@
"prettier-plugin-svelte": "^3.1.2", "prettier-plugin-svelte": "^3.1.2",
"prettier-plugin-tailwindcss": "^0.5.9", "prettier-plugin-tailwindcss": "^0.5.9",
"svelte": "^4.2.7", "svelte": "^4.2.7",
"svelte-adapter-bun": "^0.5.1",
"svelte-check": "^3.6.0", "svelte-check": "^3.6.0",
"tailwindcss": "^3.3.6", "tailwindcss": "^3.3.6",
"tslib": "^2.4.1", "tslib": "^2.4.1",
"typescript": "^5.0.0", "typescript": "^5.0.0",
"vite": "^5.0.3", "vite": "^5.0.3"
"@sveltejs/adapter-node": "^2.0.2",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6"
}, },
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",
"drizzle-orm": "=0.29.1" "drizzle-orm": "^0.29.1"
} }
} }

View file

@ -6,10 +6,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="viewport" content="width=device-width, initial-scale=1" />
%sveltekit.head% %sveltekit.head%
</head> </head>
<body <body data-sveltekit-preload-data="hover">
class="bg-neutral-100 dark:bg-neutral-900 dark:text-neutral-200"
data-sveltekit-preload-data="hover"
>
<div style="display: contents">%sveltekit.body%</div> <div style="display: contents">%sveltekit.body%</div>
</body> </body>
</html> </html>

View file

@ -1,8 +0,0 @@
<!--
@component
Preview card for a single product.

Takes one prop, `product` (`ean`, `name`, `imageUrl`), and renders a link to
`/ean/<ean>` containing the product image (using the name as its alt text)
followed by the product name.
-->
<script lang="ts">
export let product: { ean: string; name: string; imageUrl: string };
</script>
<a href={`/ean/${product.ean}`} class="flex">
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
<p class="text-xl">{product.name}</p>
</a>

View file

@ -1,10 +1,9 @@
import Database from "better-sqlite3"; import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private"; import { env } from "$env/dynamic/private";
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db"); const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
const db = drizzle(sqlite, { schema });
export { db }; export const db = drizzle(sqlite, { schema });
export * as schema from "db-datos/schema.js"; export * as schema from "db-datos/schema.js";

View file

@ -1,14 +0,0 @@
import { countDistinct } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
/**
 * Server-side loader that reports how many distinct products are stored.
 *
 * Runs a single aggregate query counting the distinct `ean` values in the
 * `precios` table and exposes the number as `nProductos`.
 *
 * @returns An object with `nProductos`, the distinct-EAN count.
 */
export const load: PageServerLoad = async () => {
  const distinctEanQuery = db
    .select({ count: countDistinct(precios.ean) })
    .from(precios);

  // The aggregate has no GROUP BY, so only the first row is read.
  const [firstRow] = await distinctEanQuery;

  return { nProductos: firstRow.count };
};

View file

@ -1,43 +1,5 @@
<script lang="ts"> <script>
import "../app.pcss"; import "../app.pcss";
import type { PageData } from "./$types";
export let data: PageData;
</script> </script>
<!-- https://flowbite.com/docs/forms/search-input/ -->
<form method="GET" action="/search">
<div class="flex items-stretch p-4">
<input
type="search"
name="q"
class="block w-full rounded-l-lg border border-gray-300 bg-gray-50 p-2.5 text-sm text-gray-900 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-600 dark:bg-gray-700 dark:text-white dark:placeholder-gray-400 dark:focus:border-blue-500"
placeholder={`Buscar entre ${data.nProductos} productos`}
required
/>
<button
type="submit"
class="block rounded-e-lg border border-blue-700 bg-blue-700 p-2.5 text-sm font-medium text-white hover:bg-blue-800 focus:outline-none focus:ring-4 focus:ring-blue-300 dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800"
>
<svg
class="h-4 w-4"
aria-hidden="true"
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 20 20"
>
<path
stroke="currentColor"
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="m19 19-4-4m0-7A7 7 0 1 1 1 8a7 7 0 0 1 14 0Z"
/>
</svg>
<span class="sr-only">Search</span>
</button>
</div>
</form>
<slot /> <slot />

View file

@ -1,3 +1,4 @@
import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema; const { precios } = schema;
@ -5,11 +6,7 @@ import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const q = db const q = db
.select({ .select({ ean: precios.ean, name: precios.name })
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.from(precios) .from(precios)
.groupBy(precios.ean) .groupBy(precios.ean)
.having(sql`max(length(name))`) .having(sql`max(length(name))`)

View file

@ -1,5 +1,4 @@
<script lang="ts"> <script lang="ts">
import ProductPreview from "$lib/ProductPreview.svelte";
import type { PageData } from "./$types"; import type { PageData } from "./$types";
export let data: PageData; export let data: PageData;
@ -31,10 +30,12 @@
<section> <section>
<h2 class="text-lg font-bold">Random</h2> <h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3"> <ul>
{#each data.precios as product} {#each data.precios as product}
<li> <li>
<ProductPreview {product} /> <a href={`/ean/${product.ean}`}>
{product.name}
</a>
</li> </li>
{/each} {/each}
</ul> </ul>

View file

@ -22,7 +22,7 @@
{#if data.meta} {#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1> <h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" /> <img src={data.meta.imageUrl} class="max-h-48" />
<div class="flex gap-2"> <div class="flex gap-2">
{#each urls as [supermercado, url]} {#each urls as [supermercado, url]}
<a <a

View file

@ -42,7 +42,7 @@
} }
</script> </script>
<div class="h-[300px] w-full min-w-[500px] bg-neutral-200 dark:invert"> <div class="h-[300px] w-full min-w-[500px]">
<ChartJs <ChartJs
type="line" type="line"
data={{ datasets }} data={{ datasets }}

View file

@ -1,19 +0,0 @@
import { error } from "@sveltejs/kit";
import { eq, max, sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
/**
 * Server-side loader for the search page: full-text search over
 * `precios_fts` driven by the `q` URL parameter.
 *
 * The raw parameter is not handed to `MATCH` unchanged. The right-hand side
 * of a full-text `MATCH` is parsed as a query expression, so everyday input
 * such as `coca-cola`, `1.5l`, a stray `"` or a whitespace-only string makes
 * SQLite raise an error, which surfaced as a 500 response. Instead, every
 * whitespace-separated term is wrapped as a double-quoted string (embedded
 * `"` doubled), and the terms are joined with spaces so that all of them must
 * match. As a consequence, query operators typed by the user (`OR`, `NOT`,
 * `*`, ...) are searched as literal text rather than interpreted.
 *
 * NOTE(review): the quoting rules used here are those of SQLite FTS5; the
 * definition of `precios_fts` is not visible from this file - confirm it is
 * an FTS5 table.
 *
 * @param event - SvelteKit load event; only `url` is read.
 * @returns `query` - the raw value of `q` (or `null` when absent);
 *          `results` - the matching rows, or `null` when there was nothing
 *          to search for (missing, empty or whitespace-only `q`).
 */
export const load: PageServerLoad = async ({ url }) => {
  const query = url.searchParams.get("q");
  let results: null | { ean: string; name: string; imageUrl: string }[] = null;

  // Turn free-form user text into a sequence of quoted full-text strings.
  const matchExpr = (query ?? "")
    .split(/\s+/)
    .filter((term) => term.length > 0)
    .map((term) => `"${term.replace(/"/g, '""')}"`)
    .join(" ");

  // An empty expression means there is nothing to search for; skip the query.
  if (matchExpr) {
    results = db.all(
      sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean
where f.name match ${matchExpr};`,
    );
  }
  return { query, results };
};

View file

@ -1,21 +0,0 @@
<!--
@component
Search results page.

Reads `data.query` and `data.results` from the page's load data. When
`data.results` is truthy it shows a heading with the query and a responsive
grid with one `ProductPreview` per result; note that an empty array is also
truthy, so a search with zero matches renders the heading with an empty list.
Otherwise it shows a short prompt inviting the user to search.
-->
<script lang="ts">
import ProductPreview from "$lib/ProductPreview.svelte";
import type { PageData } from "./$types";
export let data: PageData;
</script>
{#if data.results}
<header class="my-2">
<h1 class="text-2xl font-bold">Resultados para "{data.query}"</h1>
</header>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each data.results as product}
<li>
<ProductPreview {product} />
</li>
{/each}
</ul>
{:else}
Probá buscando algo.
{/if}

View file

@ -1,5 +1,5 @@
import adapter from "@sveltejs/adapter-node"; // import adapter from "@sveltejs/adapter-node";
// import adapter from "svelte-adapter-bun"; import adapter from "svelte-adapter-bun";
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte"; import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
/** @type {import('@sveltejs/kit').Config} */ /** @type {import('@sveltejs/kit').Config} */