mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
scrapear urls a BD
This commit is contained in:
parent
de3bf4900c
commit
98a699e454
11 changed files with 365 additions and 159 deletions
|
@ -1,23 +1,24 @@
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
import { getHtml } from "../scraper/fetch.js";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
|
||||||
// let fetched = new Set<string>();
|
export async function scrapCotoProducts() {
|
||||||
{
|
|
||||||
const initial =
|
const initial =
|
||||||
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
|
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
|
||||||
|
|
||||||
const queue = new PQueue({ concurrency: 2 });
|
const queue = new PQueue({ concurrency: 4 });
|
||||||
|
|
||||||
const pageSize = 300; // hasta 1000
|
const pageSize = 300; // hasta 1000
|
||||||
const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
|
const links = Array.from(
|
||||||
(i) => {
|
{ length: Math.ceil(29000 / pageSize) },
|
||||||
const url = new URL(initial);
|
(x, i) => i
|
||||||
url.searchParams.set("No", `${i * pageSize}`);
|
).map((i) => {
|
||||||
url.searchParams.set("Nrpp", `${pageSize}`);
|
const url = new URL(initial);
|
||||||
return url.toString();
|
url.searchParams.set("No", `${i * pageSize}`);
|
||||||
}
|
url.searchParams.set("Nrpp", `${pageSize}`);
|
||||||
);
|
return url.toString();
|
||||||
|
});
|
||||||
|
|
||||||
const promises = links.map((l) => queue.add(getPage(l)));
|
const promises = links.map((l) => queue.add(getPage(l)));
|
||||||
await Promise.all(promises);
|
await Promise.all(promises);
|
||||||
|
@ -38,22 +39,6 @@ function getPage(url: string) {
|
||||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
||||||
(a) => new URL(a.href, url).toString()
|
(a) => new URL(a.href, url).toString()
|
||||||
);
|
);
|
||||||
hrefs.forEach((h) => process.stdout.write(h + "\n"));
|
saveUrls(hrefs);
|
||||||
|
|
||||||
// const nextLinks = Array.from(
|
|
||||||
// document.querySelectorAll<HTMLAnchorElement>(
|
|
||||||
// "#atg_store_pagination a[href]"
|
|
||||||
// ),
|
|
||||||
// (a) => new URL(a.href, url).toString()
|
|
||||||
// );
|
|
||||||
|
|
||||||
// await Promise.all(
|
|
||||||
// nextLinks
|
|
||||||
// .filter((l) => !fetched.has(l))
|
|
||||||
// .map((l) => {
|
|
||||||
// fetched.add(l);
|
|
||||||
// return queue.add(getPage(l));
|
|
||||||
// })
|
|
||||||
// );
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
10
db-datos/db.ts
Normal file
10
db-datos/db.ts
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
import { Database } from "bun:sqlite";
|
||||||
|
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||||
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
|
import { migrateDb } from "./migrate.js";
|
||||||
|
import * as schema from "./schema.js";
|
||||||
|
|
||||||
|
migrateDb();
|
||||||
|
|
||||||
|
export const sqlite = new Database(DB_PATH);
|
||||||
|
export const db = drizzle(sqlite, { schema });
|
8
db-datos/drizzle/0009_breezy_forge.sql
Normal file
8
db-datos/drizzle/0009_breezy_forge.sql
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
CREATE TABLE `producto_urls` (
|
||||||
|
`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
|
||||||
|
`url` text NOT NULL,
|
||||||
|
`first_seen` integer NOT NULL,
|
||||||
|
`last_seen` integer NOT NULL
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`);
|
146
db-datos/drizzle/meta/0009_snapshot.json
Normal file
146
db-datos/drizzle/meta/0009_snapshot.json
Normal file
|
@ -0,0 +1,146 @@
|
||||||
|
{
|
||||||
|
"version": "5",
|
||||||
|
"dialect": "sqlite",
|
||||||
|
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
|
||||||
|
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
|
||||||
|
"tables": {
|
||||||
|
"precios": {
|
||||||
|
"name": "precios",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": true
|
||||||
|
},
|
||||||
|
"ean": {
|
||||||
|
"name": "ean",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"fetched_at": {
|
||||||
|
"name": "fetched_at",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"precio_centavos": {
|
||||||
|
"name": "precio_centavos",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"in_stock": {
|
||||||
|
"name": "in_stock",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"name": "url",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"warc_record_id": {
|
||||||
|
"name": "warc_record_id",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"parser_version": {
|
||||||
|
"name": "parser_version",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"name": "name",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"image_url": {
|
||||||
|
"name": "image_url",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false,
|
||||||
|
"autoincrement": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"precios_ean_idx": {
|
||||||
|
"name": "precios_ean_idx",
|
||||||
|
"columns": [
|
||||||
|
"ean"
|
||||||
|
],
|
||||||
|
"isUnique": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {}
|
||||||
|
},
|
||||||
|
"producto_urls": {
|
||||||
|
"name": "producto_urls",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": true
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"name": "url",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"first_seen": {
|
||||||
|
"name": "first_seen",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
},
|
||||||
|
"last_seen": {
|
||||||
|
"name": "last_seen",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"autoincrement": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"producto_urls_url_unique": {
|
||||||
|
"name": "producto_urls_url_unique",
|
||||||
|
"columns": [
|
||||||
|
"url"
|
||||||
|
],
|
||||||
|
"isUnique": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"enums": {},
|
||||||
|
"_meta": {
|
||||||
|
"schemas": {},
|
||||||
|
"tables": {},
|
||||||
|
"columns": {}
|
||||||
|
}
|
||||||
|
}
|
|
@ -64,6 +64,13 @@
|
||||||
"when": 1703807460152,
|
"when": 1703807460152,
|
||||||
"tag": "0008_funny_nighthawk",
|
"tag": "0008_funny_nighthawk",
|
||||||
"breakpoints": true
|
"breakpoints": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"idx": 9,
|
||||||
|
"version": "5",
|
||||||
|
"when": 1703895109501,
|
||||||
|
"tag": "0009_breezy_forge",
|
||||||
|
"breakpoints": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
|
@ -22,3 +22,12 @@ export const precios = sqliteTable(
|
||||||
);
|
);
|
||||||
|
|
||||||
export type Precio = typeof precios.$inferSelect;
|
export type Precio = typeof precios.$inferSelect;
|
||||||
|
|
||||||
|
export const productoUrls = sqliteTable("producto_urls", {
|
||||||
|
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
|
||||||
|
url: text("url").unique().notNull(),
|
||||||
|
firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(),
|
||||||
|
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type ProductUrl = typeof productoUrls.$inferSelect;
|
||||||
|
|
25
db-datos/urlHelpers.ts
Normal file
25
db-datos/urlHelpers.ts
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
import { sql } from "drizzle-orm";
|
||||||
|
import { db } from "./db.js";
|
||||||
|
import { productoUrls } from "./schema.js";
|
||||||
|
|
||||||
|
export function saveUrls(urls: string[]) {
|
||||||
|
db.transaction((tx) => {
|
||||||
|
const now = new Date();
|
||||||
|
const insertUrlTra = tx
|
||||||
|
.insert(productoUrls)
|
||||||
|
.values({
|
||||||
|
url: sql.placeholder("url"),
|
||||||
|
firstSeen: now,
|
||||||
|
lastSeen: now,
|
||||||
|
})
|
||||||
|
.onConflictDoUpdate({
|
||||||
|
target: productoUrls.url,
|
||||||
|
set: { lastSeen: now },
|
||||||
|
})
|
||||||
|
.prepare();
|
||||||
|
|
||||||
|
for (const href of urls) {
|
||||||
|
insertUrlTra.run({ url: href });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
|
@ -1,94 +1,110 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
import { getHtml } from "../scraper/fetch.js";
|
||||||
(async () => {
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
const categorias = [
|
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
const categorias = [
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
|
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
|
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
|
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
|
"https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
|
"https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
|
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
|
"https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
|
"https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/harinas",
|
"https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/picadas",
|
"https://diaonline.supermercadosdia.com.ar/almacen/harinas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
|
"https://diaonline.supermercadosdia.com.ar/almacen/picadas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/desayuno",
|
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
|
||||||
"https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
|
"https://diaonline.supermercadosdia.com.ar/desayuno",
|
||||||
"https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
|
"https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
|
||||||
"https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
|
"https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos",
|
"https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/leches",
|
"https://diaonline.supermercadosdia.com.ar/frescos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
|
"https://diaonline.supermercadosdia.com.ar/frescos/leches",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
|
"https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
|
"https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
|
"https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
|
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
|
"https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
|
"https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
|
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
|
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
|
||||||
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
|
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas",
|
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
|
"https://diaonline.supermercadosdia.com.ar/bebidas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados",
|
"https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
|
"https://diaonline.supermercadosdia.com.ar/congelados",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
|
"https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
|
"https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
|
"https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
|
"https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
|
"https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
|
"https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/congelados/hielo",
|
"https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza",
|
"https://diaonline.supermercadosdia.com.ar/congelados/hielo",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
|
"https://diaonline.supermercadosdia.com.ar/limpieza",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
|
||||||
"https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
"https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
|
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
|
||||||
|
];
|
||||||
|
|
||||||
|
export async function scrapDiaProducts() {
|
||||||
|
await Promise.all([scrapBySite(), scrapBySitemap()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function scrapBySitemap() {
|
||||||
|
// de https://diaonline.supermercadosdia.com.ar/sitemap.xml
|
||||||
|
const sitemaps = [
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
|
||||||
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||||
];
|
];
|
||||||
|
|
||||||
const links = categorias.flatMap(
|
await pMap(sitemaps, async (sitemapUrl) => {
|
||||||
(link) =>
|
const res = await fetch(sitemapUrl);
|
||||||
Array.from({ length: 51 }, (x, i) => i).map((i) => {
|
const xml = await res.text();
|
||||||
const url = new URL(link);
|
let urls = new Set<string>();
|
||||||
url.searchParams.set("page", `${i}`);
|
new HTMLRewriter()
|
||||||
return url.toString();
|
.on("loc", {
|
||||||
|
text(element) {
|
||||||
|
const txt = element.text.trim();
|
||||||
|
if (!txt) return;
|
||||||
|
urls.add(txt);
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
.transform(new Response(xml));
|
||||||
|
saveUrls(Array.from(urls));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// el order solo carga con el frontend :(
|
async function scrapBySite() {
|
||||||
// .flatMap((link) =>
|
const links = categorias.flatMap((link) =>
|
||||||
// [
|
Array.from({ length: 51 }, (x, i) => i).map((i) => {
|
||||||
// "OrderByNameASC",
|
const url = new URL(link);
|
||||||
// "OrderByNameDESC",
|
url.searchParams.set("page", `${i}`);
|
||||||
// "OrderByTopSaleDESC",
|
return url.toString();
|
||||||
// "OrderByPriceDESC",
|
})
|
||||||
// "OrderByPriceASC",
|
|
||||||
// "",
|
|
||||||
// ].map((order) => {
|
|
||||||
// const url = new URL(link);
|
|
||||||
// url.searchParams.set("order", order);
|
|
||||||
// return url.toString();
|
|
||||||
// })
|
|
||||||
// )
|
|
||||||
);
|
);
|
||||||
|
|
||||||
await pMap(
|
await pMap(
|
||||||
|
@ -103,8 +119,8 @@ import { getHtml } from "../scraper/fetch.js";
|
||||||
),
|
),
|
||||||
(a) => new URL(a.href, url).toString()
|
(a) => new URL(a.href, url).toString()
|
||||||
);
|
);
|
||||||
hrefs.forEach((h) => process.stdout.write(h + "\n"));
|
saveUrls(hrefs);
|
||||||
},
|
},
|
||||||
{ concurrency: 32 }
|
{ concurrency: 32 }
|
||||||
);
|
);
|
||||||
})();
|
}
|
||||||
|
|
|
@ -1,14 +1,19 @@
|
||||||
import { mkdtemp, access } from "node:fs/promises";
|
import { mkdtemp, access, writeFile } from "node:fs/promises";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { join, resolve } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
import { spawn } from "node:child_process";
|
import { spawn } from "node:child_process";
|
||||||
import { Supermercado } from "db-datos/supermercado.js";
|
import { Supermercado, hosts } from "db-datos/supermercado.js";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { format, formatDuration, intervalToDuration } from "date-fns";
|
import { format, formatDuration, intervalToDuration } from "date-fns";
|
||||||
import { parseWarc } from "./scrap.js";
|
import { parseWarc } from "./scrap.js";
|
||||||
import { S3Client } from "@aws-sdk/client-s3";
|
import { S3Client } from "@aws-sdk/client-s3";
|
||||||
import { Upload } from "@aws-sdk/lib-storage";
|
import { Upload } from "@aws-sdk/lib-storage";
|
||||||
import { BunFile } from "bun";
|
import { BunFile } from "bun";
|
||||||
|
import { db } from "db-datos/db.js";
|
||||||
|
import { like } from "drizzle-orm";
|
||||||
|
import { productoUrls } from "db-datos/schema.js";
|
||||||
|
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||||
|
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||||
|
|
||||||
const supermercados: Supermercado[] = [
|
const supermercados: Supermercado[] = [
|
||||||
Supermercado.Carrefour,
|
Supermercado.Carrefour,
|
||||||
|
@ -71,11 +76,41 @@ class Auto {
|
||||||
}
|
}
|
||||||
|
|
||||||
async downloadList(supermercado: Supermercado) {
|
async downloadList(supermercado: Supermercado) {
|
||||||
const listPath = resolve(
|
|
||||||
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
|
|
||||||
);
|
|
||||||
const date = new Date();
|
|
||||||
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
|
||||||
|
|
||||||
|
let listPath: string;
|
||||||
|
if (supermercado === "Carrefour") {
|
||||||
|
// TODO: carrefour todavía no tiene un scraper que guarde a la BD
|
||||||
|
listPath = resolve(
|
||||||
|
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
const t0 = performance.now();
|
||||||
|
switch (supermercado) {
|
||||||
|
case "Dia":
|
||||||
|
await scrapDiaProducts();
|
||||||
|
break;
|
||||||
|
case "Coto":
|
||||||
|
await scrapCotoProducts();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
this.inform(
|
||||||
|
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
|
||||||
|
);
|
||||||
|
|
||||||
|
listPath = join(ctxPath, `lista-${supermercado}.txt`);
|
||||||
|
const host = Object.entries(hosts).find(
|
||||||
|
([host, supe]) => supe === supermercado
|
||||||
|
)![0];
|
||||||
|
const results = await db.query.productoUrls
|
||||||
|
.findMany({
|
||||||
|
where: like(productoUrls.url, `%${host}%`),
|
||||||
|
})
|
||||||
|
.execute();
|
||||||
|
const urls = results.map((r) => r.url);
|
||||||
|
await writeFile(listPath, urls.join("\n") + "\n");
|
||||||
|
}
|
||||||
|
const date = new Date();
|
||||||
const zstdWarcName = `${supermercado}-${format(
|
const zstdWarcName = `${supermercado}-${format(
|
||||||
date,
|
date,
|
||||||
"yyyy-MM-dd-HH:mm"
|
"yyyy-MM-dd-HH:mm"
|
||||||
|
@ -98,7 +133,7 @@ class Auto {
|
||||||
const t0 = performance.now();
|
const t0 = performance.now();
|
||||||
await subproc.exited;
|
await subproc.exited;
|
||||||
this.inform(
|
this.inform(
|
||||||
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
|
`[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
|
||||||
);
|
);
|
||||||
|
|
||||||
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
|
const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
|
||||||
|
@ -187,7 +222,6 @@ class Auto {
|
||||||
stdio: ["pipe", null, null],
|
stdio: ["pipe", null, null],
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
// @ts-expect-error a los types de bun no le gusta????
|
|
||||||
decompressor.stdout.pipe(compressor.stdin);
|
decompressor.stdout.pipe(compressor.stdin);
|
||||||
compressor.on("close", (code) => {
|
compressor.on("close", (code) => {
|
||||||
if (code !== 0) {
|
if (code !== 0) {
|
||||||
|
|
|
@ -1,32 +1,6 @@
|
||||||
import { request } from "undici";
|
|
||||||
import { createBrotliDecompress, createUnzip } from "node:zlib";
|
|
||||||
import { pipeline } from "node:stream/promises";
|
|
||||||
|
|
||||||
export async function getHtml(url: string) {
|
export async function getHtml(url: string) {
|
||||||
const res = await request(url, {
|
const res = await fetch(url);
|
||||||
headers: {
|
return readableToBuffer(res.body!);
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
|
||||||
},
|
|
||||||
throwOnError: true,
|
|
||||||
bodyTimeout: 10 * 60 * 1000,
|
|
||||||
});
|
|
||||||
let output: Buffer;
|
|
||||||
switch (res.headers["content-encoding"]) {
|
|
||||||
case "gzip":
|
|
||||||
case "deflate":
|
|
||||||
output = await pipeline(res.body, createUnzip(), readableToBuffer);
|
|
||||||
break;
|
|
||||||
case "br":
|
|
||||||
output = await pipeline(
|
|
||||||
res.body,
|
|
||||||
createBrotliDecompress(),
|
|
||||||
readableToBuffer
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
output = await readableToBuffer(res.body);
|
|
||||||
}
|
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function readableToBuffer(source: AsyncIterable<any>) {
|
async function readableToBuffer(source: AsyncIterable<any>) {
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
import { Database } from "bun:sqlite";
|
|
||||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
|
||||||
import * as schema from "db-datos/schema.js";
|
import * as schema from "db-datos/schema.js";
|
||||||
import { WARCParser } from "warcio";
|
import { WARCParser } from "warcio";
|
||||||
import { writeFile } from "fs/promises";
|
import { writeFile } from "fs/promises";
|
||||||
|
@ -9,17 +7,11 @@ import { getDiaProduct } from "./parsers/dia.js";
|
||||||
import { getCotoProduct } from "./parsers/coto.js";
|
import { getCotoProduct } from "./parsers/coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { and, eq, sql } from "drizzle-orm";
|
import { and, eq, sql } from "drizzle-orm";
|
||||||
import { DB_PATH } from "db-datos/drizzle.config.js";
|
import { db } from "db-datos/db.js";
|
||||||
import { migrateDb } from "db-datos/migrate.js";
|
|
||||||
|
|
||||||
const DEBUG = false;
|
const DEBUG = false;
|
||||||
const PARSER_VERSION = 3;
|
const PARSER_VERSION = 3;
|
||||||
|
|
||||||
migrateDb();
|
|
||||||
|
|
||||||
const sqlite = new Database(DB_PATH);
|
|
||||||
const db = drizzle(sqlite, { schema });
|
|
||||||
|
|
||||||
const getPrevPrecio = db
|
const getPrevPrecio = db
|
||||||
.select({ id: schema.precios.id })
|
.select({ id: schema.precios.id })
|
||||||
.from(schema.precios)
|
.from(schema.precios)
|
||||||
|
|
Loading…
Reference in a new issue