Compare commits

..

No commits in common. "5dcc901a80fb765995bc795f3b009616dd404768" and "198e51fc97e799510ca725dfc597c9922af07e46" have entirely different histories.

45 changed files with 201 additions and 1159 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -1,44 +0,0 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
// NOTE(review): this top-level call starts a full sitemap scrape as soon as
// the module is evaluated, so a caller that only imports
// scrapCarrefourProducts and then calls it ends up scraping twice.
// Confirm the import-time run is intended.
await scrapBySitemap();
/**
 * Public entry point of the Carrefour link scraper.
 *
 * Delegates to the sitemap-based scrape; the returned promise settles when
 * that scrape has finished.
 */
export async function scrapCarrefourProducts() {
  return scrapBySitemap();
}
/**
 * Downloads Carrefour's product sitemaps and stores every URL found in a
 * `<loc>` element through `saveUrls`.
 *
 * At most 3 sitemaps are fetched concurrently. A sitemap that answers with a
 * non-2xx status is reported on stderr and skipped instead of being parsed as
 * if it were a sitemap. Network failures still reject, as before.
 */
async function scrapBySitemap() {
  // Listed in https://www.carrefour.com.ar/sitemap.xml (product-0 … product-9).
  const sitemaps = Array.from(
    { length: 10 },
    (_, i) => `https://www.carrefour.com.ar/sitemap/product-${i}.xml`
  );
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        console.error(
          `[carrefour] skipping ${sitemapUrl}: HTTP ${res.status}`
        );
        return;
      }
      const xml = await res.text();
      const urls = new Set<string>();
      // HTMLRewriter may deliver one text node in several chunks, so the
      // pieces are buffered and only committed once the node is complete.
      let pending = "";
      const rewritten = new HTMLRewriter()
        .on("loc", {
          text(chunk) {
            pending += chunk.text;
            if (!chunk.lastInTextNode) return;
            const txt = pending.trim();
            pending = "";
            if (txt) urls.add(txt);
          },
        })
        .transform(new Response(xml));
      // Drain the rewritten body so all handlers have certainly run before
      // the collected URLs are saved (harmless if the transform was eager).
      await rewritten.text();
      saveUrls(Array.from(urls));
    },
    { concurrency: 3 }
  );
}

View file

@ -1,17 +0,0 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}

View file

@ -1,24 +1,23 @@
import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom";
import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
export async function scrapCotoProducts() {
// let fetched = new Set<string>();
{
const initial =
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
const queue = new PQueue({ concurrency: 4 });
const queue = new PQueue({ concurrency: 2 });
const pageSize = 300; // hasta 1000
const links = Array.from(
{ length: Math.ceil(29000 / pageSize) },
(x, i) => i
).map((i) => {
const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
(i) => {
const url = new URL(initial);
url.searchParams.set("No", `${i * pageSize}`);
url.searchParams.set("Nrpp", `${pageSize}`);
return url.toString();
});
}
);
const promises = links.map((l) => queue.add(getPage(l)));
await Promise.all(promises);
@ -39,6 +38,22 @@ function getPage(url: string) {
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
(a) => new URL(a.href, url).toString()
);
saveUrls(hrefs);
hrefs.forEach((h) => process.stdout.write(h + "\n"));
// const nextLinks = Array.from(
// document.querySelectorAll<HTMLAnchorElement>(
// "#atg_store_pagination a[href]"
// ),
// (a) => new URL(a.href, url).toString()
// );
// await Promise.all(
// nextLinks
// .filter((l) => !fetched.has(l))
// .map((l) => {
// fetched.add(l);
// return queue.add(getPage(l));
// })
// );
};
}

View file

@ -12,6 +12,8 @@
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-queue": "^8.0.1"
"p-queue": "^8.0.1",
"tsx": "^4.7.0",
"undici": "^6.2.0"
}
}

View file

@ -1,10 +0,0 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { DB_PATH } from "./drizzle.config.js";
import { migrateDb } from "./migrate.js";
import * as schema from "./schema.js";
// Run the migrations before the shared connection below is opened.
// This is a module-level side effect: it happens when the module is first
// imported.
migrateDb();
// Raw bun:sqlite handle on the database file at DB_PATH.
export const sqlite = new Database(DB_PATH);
// Drizzle instance over `sqlite`, typed with everything exported by ./schema.js.
export const db = drizzle(sqlite, { schema });

View file

@ -1,3 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- Full-text search index over the ean, url and name columns of `precios`.
-- It is declared as an FTS5 external-content table (content=precios,
-- content_rowid=id): column values are read back from `precios`, with the
-- FTS rowid corresponding to precios.id.
create virtual table precios_fts using fts5(ean, url, name, content=precios, content_rowid=id);

View file

@ -1,2 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- Backfill: index every row that already exists in `precios`, using the
-- row's id as the FTS rowid.
insert into precios_fts(rowid,ean,url,name) select id,ean,url,name from precios;

View file

@ -1,7 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-insert trigger: add each new `precios` row to the index under its id.
CREATE TRIGGER precios_fts_ai AFTER INSERT ON precios BEGIN
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
END;

View file

@ -1,6 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-delete trigger: inserting with the special first column set to
-- 'delete' is the FTS5 command that removes an entry; it is given the old
-- row's id and column values.
CREATE TRIGGER precios_fts_ad AFTER DELETE ON precios BEGIN
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
END;

View file

@ -1,8 +0,0 @@
-- Custom SQL migration file, put your code below! --
-- https://sqlite.org/fts5.html#external_content_and_contentless_tables
-- Triggers to keep the FTS index up to date.
-- After-update trigger: first remove the entry for the old values with the
-- FTS5 'delete' command, then index the new values under the new id.
CREATE TRIGGER precios_fts_au AFTER UPDATE ON precios BEGIN
INSERT INTO precios_fts(precios_fts, rowid, ean, url, name) VALUES('delete', old.id, old.ean, old.url, old.name);
INSERT INTO precios_fts(rowid, ean, url, name) VALUES (new.id, new.ean, new.url, new.name);
END;

View file

@ -1,8 +0,0 @@
-- Table of product URLs with integer first_seen / last_seen columns.
-- The unique index created after the breakpoint allows only one row per url.
CREATE TABLE `producto_urls` (
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
`url` text NOT NULL,
`first_seen` integer NOT NULL,
`last_seen` integer NOT NULL
);
--> statement-breakpoint
CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`);

View file

@ -1,101 +0,0 @@
{
"id": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"prevId": "e1217fdb-6f54-44c5-a04b-c5aebf202102",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"prevId": "bf90a1cd-ae6a-4dba-a1aa-79f14a11d958",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"prevId": "f2cf47b9-e137-41c9-b7fb-6bc016588db0",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"prevId": "ac099405-ecd0-4637-ae5e-fb29c9847e45",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,101 +0,0 @@
{
"id": "082630a9-3744-4e33-bde5-06045ca57d36",
"prevId": "9d2f23bf-dc60-4adb-b1bd-ec75e90dda25",
"version": "5",
"dialect": "sqlite",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -1,146 +0,0 @@
{
"version": "5",
"dialect": "sqlite",
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -29,48 +29,6 @@
"when": 1703521964385,
"tag": "0003_abandoned_landau",
"breakpoints": true
},
{
"idx": 4,
"version": "5",
"when": 1703726748364,
"tag": "0004_left_wolfsbane",
"breakpoints": true
},
{
"idx": 5,
"version": "5",
"when": 1703807455551,
"tag": "0005_lucky_epoch",
"breakpoints": true
},
{
"idx": 6,
"version": "5",
"when": 1703807457204,
"tag": "0006_jazzy_madripoor",
"breakpoints": true
},
{
"idx": 7,
"version": "5",
"when": 1703807458666,
"tag": "0007_bright_silvermane",
"breakpoints": true
},
{
"idx": 8,
"version": "5",
"when": 1703807460152,
"tag": "0008_funny_nighthawk",
"breakpoints": true
},
{
"idx": 9,
"version": "5",
"when": 1703895109501,
"tag": "0009_breezy_forge",
"breakpoints": true
}
]
}

View file

@ -1,16 +1,15 @@
import Database from "bun:sqlite";
import { join, dirname } from "node:path";
import { join } from "node:path";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js";
import { DB_PATH } from "./drizzle.config.js";
const url = new URL(import.meta.url);
export function migrateDb() {
const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: join(dirname(url.pathname), "drizzle") });
migrate(db, { migrationsFolder: join(import.meta.dir, "drizzle") });
sqlite.run(`
pragma journal_mode = WAL;
PRAGMA synchronous = NORMAL;

View file

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
},
"devDependencies": {
"@types/bun": "^1.0.0",

View file

@ -22,12 +22,3 @@ export const precios = sqliteTable(
);
export type Precio = typeof precios.$inferSelect;
// Drizzle definition of the `producto_urls` table: one row per distinct URL
// (`url` is unique and non-null), with two non-null timestamp columns.
export const productoUrls = sqliteTable("producto_urls", {
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
url: text("url").unique().notNull(),
firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(),
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
});
// Row shape returned when selecting from `productoUrls`.
export type ProductUrl = typeof productoUrls.$inferSelect;

View file

@ -1,25 +0,0 @@
import { sql } from "drizzle-orm";
import { db } from "./db.js";
import { productoUrls } from "./schema.js";
/**
 * Upserts a batch of product URLs inside a single transaction.
 *
 * A URL that is not stored yet is inserted with `firstSeen` and `lastSeen`
 * set to the current time; a URL that already exists only gets its
 * `lastSeen` refreshed. The same timestamp is used for the whole batch.
 *
 * @param urls URLs to record.
 */
export function saveUrls(urls: string[]) {
  db.transaction((tx) => {
    const seenAt = new Date();
    // Prepared once, then executed for every URL with a different binding.
    const upsert = tx
      .insert(productoUrls)
      .values({
        url: sql.placeholder("url"),
        firstSeen: seenAt,
        lastSeen: seenAt,
      })
      .onConflictDoUpdate({
        target: productoUrls.url,
        set: { lastSeen: seenAt },
      })
      .prepare();
    urls.forEach((url) => {
      upsert.run({ url });
    });
  });
}

View file

@ -1,9 +1,8 @@
import pMap from "p-map";
import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js";
const categorias = [
(async () => {
const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen",
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
@ -65,46 +64,31 @@ const categorias = [
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
];
export async function scrapDiaProducts() {
await Promise.all([scrapBySite(), scrapBySitemap()]);
}
async function scrapBySitemap() {
// de https://diaonline.supermercadosdia.com.ar/sitemap.xml
const sitemaps = [
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
];
await pMap(sitemaps, async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
}
async function scrapBySite() {
const links = categorias.flatMap((link) =>
const links = categorias.flatMap(
(link) =>
Array.from({ length: 51 }, (x, i) => i).map((i) => {
const url = new URL(link);
url.searchParams.set("page", `${i}`);
return url.toString();
})
// el order solo carga con el frontend :(
// .flatMap((link) =>
// [
// "OrderByNameASC",
// "OrderByNameDESC",
// "OrderByTopSaleDESC",
// "OrderByPriceDESC",
// "OrderByPriceASC",
// "",
// ].map((order) => {
// const url = new URL(link);
// url.searchParams.set("order", order);
// return url.toString();
// })
// )
);
await pMap(
@ -119,8 +103,8 @@ async function scrapBySite() {
),
(a) => new URL(a.href, url).toString()
);
saveUrls(hrefs);
hrefs.forEach((h) => process.stdout.write(h + "\n"));
},
{ concurrency: 32 }
);
}
})();

View file

@ -12,6 +12,8 @@
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.0"
"p-map": "^7.0.0",
"tsx": "^4.7.0",
"undici": "^6.2.0"
}
}

View file

@ -4,7 +4,6 @@
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"carrefour-link-scraper",
"scraper",
"sitio",
"db-datos"

View file

@ -1,20 +1,14 @@
import { mkdtemp, access, writeFile } from "node:fs/promises";
import { mkdtemp, access } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join, resolve } from "node:path";
import { spawn } from "node:child_process";
import { Supermercado, hosts } from "db-datos/supermercado.js";
import { Supermercado } from "db-datos/supermercado.js";
import PQueue from "p-queue";
import { format, formatDuration, intervalToDuration } from "date-fns";
import { parseWarc } from "./scrap.js";
import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
const supermercados: Supermercado[] = [
Supermercado.Carrefour,
@ -77,40 +71,11 @@ class Auto {
}
async downloadList(supermercado: Supermercado) {
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
let listPath: string;
{
const t0 = performance.now();
switch (supermercado) {
case "Dia":
await scrapDiaProducts();
break;
case "Coto":
await scrapCotoProducts();
break;
case "Carrefour":
await scrapCarrefourProducts();
break;
}
this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
const listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
);
}
listPath = join(ctxPath, `lista-${supermercado}.txt`);
const host = Object.entries(hosts).find(
([host, supe]) => supe === supermercado
)![0];
const results = await db.query.productoUrls
.findMany({
where: like(productoUrls.url, `%${host}%`),
})
.execute();
const urls = results.map((r) => r.url);
await writeFile(listPath, urls.join("\n") + "\n");
const date = new Date();
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
const zstdWarcName = `${supermercado}-${format(
date,
"yyyy-MM-dd-HH:mm"
@ -133,7 +98,7 @@ class Auto {
const t0 = performance.now();
await subproc.exited;
this.inform(
`[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
);
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
@ -222,6 +187,7 @@ class Auto {
stdio: ["pipe", null, null],
}
);
// @ts-expect-error a los types de bun no le gusta????
decompressor.stdout.pipe(compressor.stdin);
compressor.on("close", (code) => {
if (code !== 0) {

View file

@ -1,6 +1,32 @@
import { request } from "undici";
import { createBrotliDecompress, createUnzip } from "node:zlib";
import { pipeline } from "node:stream/promises";
export async function getHtml(url: string) {
const res = await fetch(url);
return readableToBuffer(res.body!);
const res = await request(url, {
headers: {
"Accept-Encoding": "gzip, deflate, br",
},
throwOnError: true,
bodyTimeout: 10 * 60 * 1000,
});
let output: Buffer;
switch (res.headers["content-encoding"]) {
case "gzip":
case "deflate":
output = await pipeline(res.body, createUnzip(), readableToBuffer);
break;
case "br":
output = await pipeline(
res.body,
createBrotliDecompress(),
readableToBuffer
);
break;
default:
output = await readableToBuffer(res.body);
}
return output;
}
async function readableToBuffer(source: AsyncIterable<any>) {

View file

@ -5,8 +5,7 @@
"description": "",
"main": "index.js",
"scripts": {
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile ..",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/scraper"
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/scraper -f ./Containerfile .."
},
"keywords": [],
"author": "",
@ -16,10 +15,11 @@
"@aws-sdk/lib-storage": "^3.478.0",
"date-fns": "^3.0.6",
"db-datos": "workspace:^",
"drizzle-orm": "=0.29.1",
"drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5",
"nanoid": "^5.0.4",
"p-queue": "^8.0.1",
"undici": "^6.2.0",
"warcio": "^2.2.1",
"zod": "^3.22.4"
},

View file

@ -34,11 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom);
const name = dom.document
.querySelector("h1.product_page")
?.textContent?.trim();
const imageUrl =
dom.document.querySelector<HTMLImageElement>(".zoom img")?.src;
const name = dom.document.querySelector("h1.product_page")?.textContent;
const imageUrl = dom.document.querySelector<HTMLImageElement>(
".productImageZoom img"
)?.src;
return { name, imageUrl, ean, precioCentavos };
}

View file

@ -1,3 +1,5 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
@ -7,10 +9,16 @@ import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import { and, eq, sql } from "drizzle-orm";
import { db } from "db-datos/db.js";
import { DB_PATH } from "db-datos/drizzle.config.js";
import { migrateDb } from "db-datos/migrate.js";
const DEBUG = false;
const PARSER_VERSION = 4;
const PARSER_VERSION = 2;
migrateDb();
const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema });
const getPrevPrecio = db
.select({ id: schema.precios.id })

View file

@ -1,24 +1,7 @@
FROM docker.io/oven/bun:1-alpine as build
RUN apk add --no-cache nodejs
WORKDIR /usr/src/app
COPY . .
WORKDIR /usr/src/app/sitio
RUN bun install && \
bun run build
# FROM docker.io/oven/bun:1-alpine as deps
# WORKDIR /usr/src/app/sitio
# RUN bun init && bun install "better-sqlite3"@"^9.2.2" "chart.js"@"^4.4.1" "chartjs-adapter-dayjs-4"@"^1.0.4" "dayjs"@"^1.11.10" "drizzle-orm"@"^0.29.1"
# COPY --from=build /usr/src/app/db-datos node_modules/db-datos
FROM docker.io/alpine:3.19
RUN apk add --no-cache tini nodejs npm jq
WORKDIR /app
COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build .
FROM docker.io/oven/bun:1-alpine
COPY build/ .
RUN bun i
EXPOSE 3000
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto
@ -26,6 +9,5 @@ ENV HOST_HEADER=x-forwarded-host
VOLUME /db
ENV DB_PATH=/db/db.db
EXPOSE 3000
CMD ["tini", "node", "."]
CMD ["bun", "run", "start"]

View file

@ -5,7 +5,7 @@
"scripts": {
"dev": "vite dev",
"build": "vite build",
"build:container": "podman build -t gitea.nulo.in/nulo/preciazo/sitio -f ./Containerfile ..",
"build:container": "bun --bun vite build && podman build -t gitea.nulo.in/nulo/preciazo/sitio .",
"push:container": "bun build:container && podman push gitea.nulo.in/nulo/preciazo/sitio",
"preview": "vite preview",
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
@ -16,6 +16,7 @@
"devDependencies": {
"@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/bun": "^1.0.0",
"autoprefixer": "^10.4.16",
"db-datos": "workspace:^",
"postcss": "^8.4.32",
@ -24,21 +25,18 @@
"prettier-plugin-svelte": "^3.1.2",
"prettier-plugin-tailwindcss": "^0.5.9",
"svelte": "^4.2.7",
"svelte-adapter-bun": "^0.5.1",
"svelte-check": "^3.6.0",
"tailwindcss": "^3.3.6",
"tslib": "^2.4.1",
"typescript": "^5.0.0",
"vite": "^5.0.3",
"@sveltejs/adapter-node": "^2.0.2",
"@types/better-sqlite3": "^7.6.8",
"@types/node": "^20.10.6"
"vite": "^5.0.3"
},
"type": "module",
"dependencies": {
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10",
"drizzle-orm": "=0.29.1"
"drizzle-orm": "^0.29.1"
}
}

View file

@ -6,10 +6,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1" />
%sveltekit.head%
</head>
<body
class="bg-neutral-100 dark:bg-neutral-900 dark:text-neutral-200"
data-sveltekit-preload-data="hover"
>
<body data-sveltekit-preload-data="hover">
<div style="display: contents">%sveltekit.body%</div>
</body>
</html>

View file

@ -1,8 +0,0 @@
<script lang="ts">
// Product to preview: `ean` builds the link target, `name` is used as the
// caption and the image alt text, `imageUrl` is the image source.
export let product: { ean: string; name: string; imageUrl: string };
</script>
<!-- Image and name of the product, linking to its /ean/<ean> page. -->
<a href={`/ean/${product.ean}`} class="flex">
<img src={product.imageUrl} alt={product.name} class="max-h-48" />
<p class="text-xl">{product.name}</p>
</a>

View file

@ -1,10 +1,9 @@
import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3";
import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private";
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
const db = drizzle(sqlite, { schema });
export { db };
export const db = drizzle(sqlite, { schema });
export * as schema from "db-datos/schema.js";

View file

@ -1,14 +0,0 @@
import { countDistinct } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
/**
 * Server load function: counts the distinct EANs present in `precios` and
 * exposes the figure as `nProductos`.
 */
export const load: PageServerLoad = async () => {
  const [row] = await db
    .select({ count: countDistinct(precios.ean) })
    .from(precios);
  return { nProductos: row.count };
};

View file

@ -1,43 +1,5 @@
<script lang="ts">
<script>
import "../app.pcss";
import type { PageData } from "./$types";
export let data: PageData;
</script>
<!-- https://flowbite.com/docs/forms/search-input/ -->
<form method="GET" action="/search">
<div class="flex items-stretch p-4">
<input
type="search"
name="q"
class="block w-full rounded-l-lg border border-gray-300 bg-gray-50 p-2.5 text-sm text-gray-900 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-600 dark:bg-gray-700 dark:text-white dark:placeholder-gray-400 dark:focus:border-blue-500"
placeholder={`Buscar entre ${data.nProductos} productos`}
required
/>
<button
type="submit"
class="block rounded-e-lg border border-blue-700 bg-blue-700 p-2.5 text-sm font-medium text-white hover:bg-blue-800 focus:outline-none focus:ring-4 focus:ring-blue-300 dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800"
>
<svg
class="h-4 w-4"
aria-hidden="true"
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 20 20"
>
<path
stroke="currentColor"
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="2"
d="m19 19-4-4m0-7A7 7 0 1 1 1 8a7 7 0 0 1 14 0Z"
/>
</svg>
<span class="sr-only">Search</span>
</button>
</div>
</form>
<slot />

View file

@ -1,3 +1,4 @@
import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
@ -5,11 +6,7 @@ import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => {
const q = db
.select({
ean: precios.ean,
name: precios.name,
imageUrl: precios.imageUrl,
})
.select({ ean: precios.ean, name: precios.name })
.from(precios)
.groupBy(precios.ean)
.having(sql`max(length(name))`)

View file

@ -1,5 +1,4 @@
<script lang="ts">
import ProductPreview from "$lib/ProductPreview.svelte";
import type { PageData } from "./$types";
export let data: PageData;
@ -31,10 +30,12 @@
<section>
<h2 class="text-lg font-bold">Random</h2>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
<ul>
{#each data.precios as product}
<li>
<ProductPreview {product} />
<a href={`/ean/${product.ean}`}>
{product.name}
</a>
</li>
{/each}
</ul>

View file

@ -22,7 +22,7 @@
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
<img src={data.meta.imageUrl} class="max-h-48" />
<div class="flex gap-2">
{#each urls as [supermercado, url]}
<a

View file

@ -42,7 +42,7 @@
}
</script>
<div class="h-[300px] w-full min-w-[500px] bg-neutral-200 dark:invert">
<div class="h-[300px] w-full min-w-[500px]">
<ChartJs
type="line"
data={{ datasets }}

View file

@ -1,19 +0,0 @@
import { error } from "@sveltejs/kit";
import { eq, max, sql } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
/**
 * Server load function for the search page.
 *
 * Reads the `q` query parameter and, when it is non-empty, runs it as a
 * full-text MATCH against the `name` column of `precios_fts`.
 *
 * @returns `query` as received (or null) and `results`: null when no search
 *   was performed, otherwise one entry per matching EAN.
 */
export const load: PageServerLoad = async ({ url }) => {
  const query = url.searchParams.get("q");
  let results: null | { ean: string; name: string; imageUrl: string }[] = null;
  if (query) {
    // Join on the FTS rowid (= precios.id) so each hit maps to exactly the
    // row that matched; joining on ean paired every hit with every price
    // record of that product and returned k*k rows for k records.
    // Grouping by ean then collapses the remaining per-record duplicates.
    results = db.all(
      sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
      join precios p on p.id = f.rowid
      where f.name match ${query}
      group by p.ean;`,
    );
  }
  return { query, results };
};

View file

@ -1,21 +0,0 @@
<script lang="ts">
import ProductPreview from "$lib/ProductPreview.svelte";
import type { PageData } from "./$types";
// Provided by the page's server load: `query` and `results`.
export let data: PageData;
</script>
<!-- `results` is null when no search was run; an empty array is still truthy,
     so a search without matches shows the heading with an empty list. -->
{#if data.results}
<header class="my-2">
<h1 class="text-2xl font-bold">Resultados para "{data.query}"</h1>
</header>
<ul class="grid grid-cols-1 gap-4 md:grid-cols-2 lg:grid-cols-3">
{#each data.results as product}
<li>
<ProductPreview {product} />
</li>
{/each}
</ul>
{:else}
Probá buscando algo.
{/if}

View file

@ -1,5 +1,5 @@
import adapter from "@sveltejs/adapter-node";
// import adapter from "svelte-adapter-bun";
// import adapter from "@sveltejs/adapter-node";
import adapter from "svelte-adapter-bun";
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
/** @type {import('@sveltejs/kit').Config} */