From 98a699e4547c734b4192253bdaaf6b0f09d360be Mon Sep 17 00:00:00 2001 From: Nulo Date: Fri, 29 Dec 2023 21:49:32 -0300 Subject: [PATCH] scrapear urls a BD --- coto-link-scraper/index.ts | 41 ++--- db-datos/db.ts | 10 ++ db-datos/drizzle/0009_breezy_forge.sql | 8 + db-datos/drizzle/meta/0009_snapshot.json | 146 ++++++++++++++++++ db-datos/drizzle/meta/_journal.json | 7 + db-datos/schema.ts | 9 ++ db-datos/urlHelpers.ts | 25 +++ dia-link-scraper/index.ts | 188 ++++++++++++----------- scraper/auto.ts | 50 +++++- scraper/fetch.ts | 30 +--- scraper/scrap.ts | 10 +- 11 files changed, 365 insertions(+), 159 deletions(-) create mode 100644 db-datos/db.ts create mode 100644 db-datos/drizzle/0009_breezy_forge.sql create mode 100644 db-datos/drizzle/meta/0009_snapshot.json create mode 100644 db-datos/urlHelpers.ts diff --git a/coto-link-scraper/index.ts b/coto-link-scraper/index.ts index e983d52..b25be9b 100644 --- a/coto-link-scraper/index.ts +++ b/coto-link-scraper/index.ts @@ -1,23 +1,24 @@ import { getHtml } from "../scraper/fetch.js"; import { parseHTML } from "linkedom"; import PQueue from "p-queue"; +import { saveUrls } from "db-datos/urlHelpers.js"; -// let fetched = new Set(); -{ +export async function scrapCotoProducts() { const initial = "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200"; - const queue = new PQueue({ concurrency: 2 }); + const queue = new PQueue({ concurrency: 4 }); const pageSize = 300; // hasta 1000 - const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map( - (i) => { - const url = new URL(initial); - url.searchParams.set("No", `${i * pageSize}`); - url.searchParams.set("Nrpp", `${pageSize}`); - return url.toString(); - } - ); + const links = Array.from( + { length: Math.ceil(29000 / pageSize) }, + (x, i) => i + ).map((i) => { + const url = new URL(initial); + url.searchParams.set("No", `${i * pageSize}`); + url.searchParams.set("Nrpp", `${pageSize}`); + return url.toString(); + }); const promises = links.map((l) => queue.add(getPage(l))); await Promise.all(promises); @@ -38,22 +39,6 @@ function getPage(url: string) { document.querySelectorAll(".product_info_container a"), (a) => new URL(a.href, url).toString() ); - hrefs.forEach((h) => process.stdout.write(h + "\n")); - - // const nextLinks = Array.from( - // document.querySelectorAll( - // "#atg_store_pagination a[href]" - // ), - // (a) => new URL(a.href, url).toString() - // ); - - // await Promise.all( - // nextLinks - // .filter((l) => !fetched.has(l)) - // .map((l) => { - // fetched.add(l); - // return queue.add(getPage(l)); - // }) - // ); + saveUrls(hrefs); }; } diff --git a/db-datos/db.ts b/db-datos/db.ts new file mode 100644 index 0000000..8781000 --- /dev/null +++ b/db-datos/db.ts @@ -0,0 +1,10 @@ +import { Database } from "bun:sqlite"; +import { drizzle } from "drizzle-orm/bun-sqlite"; +import { DB_PATH } from "./drizzle.config.js"; +import { migrateDb } from "./migrate.js"; +import * as schema from "./schema.js"; + +migrateDb(); + +export const sqlite = new Database(DB_PATH); +export const db = drizzle(sqlite, { schema }); diff --git a/db-datos/drizzle/0009_breezy_forge.sql b/db-datos/drizzle/0009_breezy_forge.sql new file mode 100644 index 0000000..b9839ff --- /dev/null +++ b/db-datos/drizzle/0009_breezy_forge.sql @@ -0,0 +1,8 @@ +CREATE TABLE `producto_urls` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `url` text NOT NULL, + `first_seen` integer NOT NULL, + `last_seen` integer NOT NULL +); +--> statement-breakpoint +CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`); \ No newline at end of file diff --git a/db-datos/drizzle/meta/0009_snapshot.json b/db-datos/drizzle/meta/0009_snapshot.json new file mode 100644 index 0000000..100a9ac --- /dev/null +++ b/db-datos/drizzle/meta/0009_snapshot.json @@ -0,0 +1,146 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa", + "prevId": "082630a9-3744-4e33-bde5-06045ca57d36", + "tables": { + "precios": { + "name": "precios", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "ean": { + "name": "ean", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "precio_centavos": { + "name": "precio_centavos", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "in_stock": { + "name": "in_stock", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "warc_record_id": { + "name": "warc_record_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "parser_version": { + "name": "parser_version", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image_url": { + "name": "image_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "precios_ean_idx": { + "name": "precios_ean_idx", + "columns": [ + "ean" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + }, + "producto_urls": { + "name": "producto_urls", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "first_seen": { + "name": "first_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "last_seen": { + "name": "last_seen", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "producto_urls_url_unique": { + "name": "producto_urls_url_unique", + "columns": [ + "url" + ], + "isUnique": true + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +} \ No newline at end of file diff --git a/db-datos/drizzle/meta/_journal.json b/db-datos/drizzle/meta/_journal.json index a9d231b..bd847ef 100644 --- a/db-datos/drizzle/meta/_journal.json +++ b/db-datos/drizzle/meta/_journal.json @@ -64,6 +64,13 @@ "when": 1703807460152, "tag": "0008_funny_nighthawk", "breakpoints": true + }, + { + "idx": 9, + "version": "5", + "when": 1703895109501, + "tag": "0009_breezy_forge", + "breakpoints": true } ] } \ No newline at end of file diff --git a/db-datos/schema.ts b/db-datos/schema.ts index 3c5a833..a45efda 100644 --- a/db-datos/schema.ts +++ b/db-datos/schema.ts @@ -22,3 +22,12 @@ export const precios = sqliteTable( ); export type Precio = typeof precios.$inferSelect; + +export const productoUrls = sqliteTable("producto_urls", { + id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }), + url: text("url").unique().notNull(), + firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(), + lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(), +}); + +export type ProductUrl = typeof productoUrls.$inferSelect; diff --git a/db-datos/urlHelpers.ts b/db-datos/urlHelpers.ts new file mode 100644 index 0000000..2ca2d03 --- /dev/null +++ b/db-datos/urlHelpers.ts @@ -0,0 +1,25 @@ +import { sql } from "drizzle-orm"; +import { db } from "./db.js"; +import { productoUrls } from "./schema.js"; + +export function saveUrls(urls: string[]) { + db.transaction((tx) => { + const now = new Date(); + const insertUrlTra = tx + .insert(productoUrls) + .values({ + url: sql.placeholder("url"), + firstSeen: now, + lastSeen: now, + }) + .onConflictDoUpdate({ + target: productoUrls.url, + set: { lastSeen: now }, + }) + .prepare(); + + for (const href of urls) { + insertUrlTra.run({ url: href }); + } + }); +} diff --git a/dia-link-scraper/index.ts b/dia-link-scraper/index.ts index 9bf4ce3..67709b0 100644 --- a/dia-link-scraper/index.ts +++ b/dia-link-scraper/index.ts @@ -1,94 +1,110 @@ import pMap from "p-map"; import { parseHTML } from "linkedom"; import { getHtml } from "../scraper/fetch.js"; -(async () => { - const categorias = [ - "https://diaonline.supermercadosdia.com.ar/almacen", - "https://diaonline.supermercadosdia.com.ar/almacen/conservas", - "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos", - "https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas", - "https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres", - "https://diaonline.supermercadosdia.com.ar/almacen/panaderia", - "https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores", - "https://diaonline.supermercadosdia.com.ar/almacen/reposteria", - "https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas", - "https://diaonline.supermercadosdia.com.ar/almacen/harinas", - "https://diaonline.supermercadosdia.com.ar/almacen/picadas", - "https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores", - "https://diaonline.supermercadosdia.com.ar/desayuno", - "https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales", - "https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes", - "https://diaonline.supermercadosdia.com.ar/desayuno/para-untar", - "https://diaonline.supermercadosdia.com.ar/frescos", - "https://diaonline.supermercadosdia.com.ar/frescos/leches", - "https://diaonline.supermercadosdia.com.ar/frescos/fiambreria", - "https://diaonline.supermercadosdia.com.ar/frescos/lacteos", - "https://diaonline.supermercadosdia.com.ar/frescos/carniceria", - "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras", - "https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas", - "https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar", - "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas", - "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras", - "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos", - "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos", - "https://diaonline.supermercadosdia.com.ar/bebidas", - "https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas", - "https://diaonline.supermercadosdia.com.ar/bebidas/cervezas", - "https://diaonline.supermercadosdia.com.ar/bebidas/aguas", - "https://diaonline.supermercadosdia.com.ar/bebidas/bodega", - "https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas", - "https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos", - "https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores", - "https://diaonline.supermercadosdia.com.ar/congelados", - "https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones", - "https://diaonline.supermercadosdia.com.ar/congelados/rebozados", - "https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados", - "https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados", - "https://diaonline.supermercadosdia.com.ar/congelados/pescaderia", - "https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas", - "https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas", - "https://diaonline.supermercadosdia.com.ar/congelados/hielo", - "https://diaonline.supermercadosdia.com.ar/limpieza", - "https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa", - "https://diaonline.supermercadosdia.com.ar/limpieza/papeleria", - "https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores", - "https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina", - "https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza", - "https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente", - "https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas", - "https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas", - "https://diaonline.supermercadosdia.com.ar/limpieza/bolsas", - "https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC", - "https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC", - "https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC", - "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC", - "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC", - "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC", +import { saveUrls } from "db-datos/urlHelpers.js"; + +const categorias = [ + "https://diaonline.supermercadosdia.com.ar/almacen", + "https://diaonline.supermercadosdia.com.ar/almacen/conservas", + "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos", + "https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas", + "https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres", + "https://diaonline.supermercadosdia.com.ar/almacen/panaderia", + "https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores", + "https://diaonline.supermercadosdia.com.ar/almacen/reposteria", + "https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas", + "https://diaonline.supermercadosdia.com.ar/almacen/harinas", + "https://diaonline.supermercadosdia.com.ar/almacen/picadas", + "https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores", + "https://diaonline.supermercadosdia.com.ar/desayuno", + "https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales", + "https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes", + "https://diaonline.supermercadosdia.com.ar/desayuno/para-untar", + "https://diaonline.supermercadosdia.com.ar/frescos", + "https://diaonline.supermercadosdia.com.ar/frescos/leches", + "https://diaonline.supermercadosdia.com.ar/frescos/fiambreria", + "https://diaonline.supermercadosdia.com.ar/frescos/lacteos", + "https://diaonline.supermercadosdia.com.ar/frescos/carniceria", + "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras", + "https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas", + "https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar", + "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas", + "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras", + "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos", + "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos", + "https://diaonline.supermercadosdia.com.ar/bebidas", + "https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas", + "https://diaonline.supermercadosdia.com.ar/bebidas/cervezas", + "https://diaonline.supermercadosdia.com.ar/bebidas/aguas", + "https://diaonline.supermercadosdia.com.ar/bebidas/bodega", + "https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas", + "https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos", + "https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores", + "https://diaonline.supermercadosdia.com.ar/congelados", + "https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones", + "https://diaonline.supermercadosdia.com.ar/congelados/rebozados", + "https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados", + "https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados", + "https://diaonline.supermercadosdia.com.ar/congelados/pescaderia", + "https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas", + "https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas", + "https://diaonline.supermercadosdia.com.ar/congelados/hielo", + "https://diaonline.supermercadosdia.com.ar/limpieza", + "https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa", + "https://diaonline.supermercadosdia.com.ar/limpieza/papeleria", + "https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores", + "https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina", + "https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza", + "https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente", + "https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas", + "https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas", + "https://diaonline.supermercadosdia.com.ar/limpieza/bolsas", + "https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC", + "https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC", + "https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC", + "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC", + "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC", + "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC", +]; + +export async function scrapDiaProducts() { + await Promise.all([scrapBySite(), scrapBySitemap()]); +} + +async function scrapBySitemap() { + // de https://diaonline.supermercadosdia.com.ar/sitemap.xml + const sitemaps = [ + "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml", + "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml", + "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml", + "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml", + "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml", ]; - const links = categorias.flatMap( - (link) => - Array.from({ length: 51 }, (x, i) => i).map((i) => { - const url = new URL(link); - url.searchParams.set("page", `${i}`); - return url.toString(); + await pMap(sitemaps, async (sitemapUrl) => { + const res = await fetch(sitemapUrl); + const xml = await res.text(); + let urls = new Set(); + new HTMLRewriter() + .on("loc", { + text(element) { + const txt = element.text.trim(); + if (!txt) return; + urls.add(txt); + }, }) + .transform(new Response(xml)); + saveUrls(Array.from(urls)); + }); +} - // el order solo carga con el frontend :( - // .flatMap((link) => - // [ - // "OrderByNameASC", - // "OrderByNameDESC", - // "OrderByTopSaleDESC", - // "OrderByPriceDESC", - // "OrderByPriceASC", - // "", - // ].map((order) => { - // const url = new URL(link); - // url.searchParams.set("order", order); - // return url.toString(); - // }) - // ) +async function scrapBySite() { + const links = categorias.flatMap((link) => + Array.from({ length: 51 }, (x, i) => i).map((i) => { + const url = new URL(link); + url.searchParams.set("page", `${i}`); + return url.toString(); + }) ); await pMap( @@ -103,8 +119,8 @@ import { getHtml } from "../scraper/fetch.js"; ), (a) => new URL(a.href, url).toString() ); - hrefs.forEach((h) => process.stdout.write(h + "\n")); + saveUrls(hrefs); }, { concurrency: 32 } ); -})(); +} diff --git a/scraper/auto.ts b/scraper/auto.ts index 05618a4..b82b86a 100644 --- a/scraper/auto.ts +++ b/scraper/auto.ts @@ -1,14 +1,19 @@ -import { mkdtemp, access } from "node:fs/promises"; +import { mkdtemp, access, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; import { spawn } from "node:child_process"; -import { Supermercado } from "db-datos/supermercado.js"; +import { Supermercado, hosts } from "db-datos/supermercado.js"; import PQueue from "p-queue"; import { format, formatDuration, intervalToDuration } from "date-fns"; import { parseWarc } from "./scrap.js"; import { S3Client } from "@aws-sdk/client-s3"; import { Upload } from "@aws-sdk/lib-storage"; import { BunFile } from "bun"; +import { db } from "db-datos/db.js"; +import { like } from "drizzle-orm"; +import { productoUrls } from "db-datos/schema.js"; +import { scrapDiaProducts } from "../dia-link-scraper/index.js"; +import { scrapCotoProducts } from "../coto-link-scraper/index.js"; const supermercados: Supermercado[] = [ Supermercado.Carrefour, @@ -71,11 +76,41 @@ class Auto { } async downloadList(supermercado: Supermercado) { - const listPath = resolve( - join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`) - ); - const date = new Date(); const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-")); + + let listPath: string; + if (supermercado === "Carrefour") { + // TODO: carrefour todavía no tiene un scraper que guarde a la BD + listPath = resolve( + join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`) + ); + } else { + const t0 = performance.now(); + switch (supermercado) { + case "Dia": + await scrapDiaProducts(); + break; + case "Coto": + await scrapCotoProducts(); + break; + } + this.inform( + `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}` + ); + + listPath = join(ctxPath, `lista-${supermercado}.txt`); + const host = Object.entries(hosts).find( + ([host, supe]) => supe === supermercado + )![0]; + const results = await db.query.productoUrls + .findMany({ + where: like(productoUrls.url, `%${host}%`), + }) + .execute(); + const urls = results.map((r) => r.url); + await writeFile(listPath, urls.join("\n") + "\n"); + } + const date = new Date(); const zstdWarcName = `${supermercado}-${format( date, "yyyy-MM-dd-HH:mm" @@ -98,7 +133,7 @@ class Auto { const t0 = performance.now(); await subproc.exited; this.inform( - `wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` + `[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` ); const gzippedWarcPath = join(ctxPath, "temp.warc.gz"); @@ -187,7 +222,6 @@ class Auto { stdio: ["pipe", null, null], } ); - // @ts-expect-error a los types de bun no le gusta???? decompressor.stdout.pipe(compressor.stdin); compressor.on("close", (code) => { if (code !== 0) { diff --git a/scraper/fetch.ts b/scraper/fetch.ts index 733eacc..59bffb2 100644 --- a/scraper/fetch.ts +++ b/scraper/fetch.ts @@ -1,32 +1,6 @@ -import { request } from "undici"; -import { createBrotliDecompress, createUnzip } from "node:zlib"; -import { pipeline } from "node:stream/promises"; - export async function getHtml(url: string) { - const res = await request(url, { - headers: { - "Accept-Encoding": "gzip, deflate, br", - }, - throwOnError: true, - bodyTimeout: 10 * 60 * 1000, - }); - let output: Buffer; - switch (res.headers["content-encoding"]) { - case "gzip": - case "deflate": - output = await pipeline(res.body, createUnzip(), readableToBuffer); - break; - case "br": - output = await pipeline( - res.body, - createBrotliDecompress(), - readableToBuffer - ); - break; - default: - output = await readableToBuffer(res.body); - } - return output; + const res = await fetch(url); + return readableToBuffer(res.body!); } async function readableToBuffer(source: AsyncIterable) { diff --git a/scraper/scrap.ts b/scraper/scrap.ts index f6928ed..61d16a5 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -1,5 +1,3 @@ -import { Database } from "bun:sqlite"; -import { drizzle } from "drizzle-orm/bun-sqlite"; import * as schema from "db-datos/schema.js"; import { WARCParser } from "warcio"; import { writeFile } from "fs/promises"; @@ -9,17 +7,11 @@ import { getDiaProduct } from "./parsers/dia.js"; import { getCotoProduct } from "./parsers/coto.js"; import { join } from "path"; import { and, eq, sql } from "drizzle-orm"; -import { DB_PATH } from "db-datos/drizzle.config.js"; -import { migrateDb } from "db-datos/migrate.js"; +import { db } from "db-datos/db.js"; const DEBUG = false; const PARSER_VERSION = 3; -migrateDb(); - -const sqlite = new Database(DB_PATH); -const db = drizzle(sqlite, { schema }); - const getPrevPrecio = db .select({ id: schema.precios.id }) .from(schema.precios)