scrapear urls a BD

This commit is contained in:
Cat /dev/Nulo 2023-12-29 21:49:32 -03:00
parent de3bf4900c
commit 98a699e454
11 changed files with 365 additions and 159 deletions

View file

@ -1,23 +1,24 @@
import { getHtml } from "../scraper/fetch.js"; import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
// let fetched = new Set<string>(); export async function scrapCotoProducts() {
{
const initial = const initial =
"https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200"; "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
const queue = new PQueue({ concurrency: 2 }); const queue = new PQueue({ concurrency: 4 });
const pageSize = 300; // hasta 1000 const pageSize = 300; // hasta 1000
const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map( const links = Array.from(
(i) => { { length: Math.ceil(29000 / pageSize) },
(x, i) => i
).map((i) => {
const url = new URL(initial); const url = new URL(initial);
url.searchParams.set("No", `${i * pageSize}`); url.searchParams.set("No", `${i * pageSize}`);
url.searchParams.set("Nrpp", `${pageSize}`); url.searchParams.set("Nrpp", `${pageSize}`);
return url.toString(); return url.toString();
} });
);
const promises = links.map((l) => queue.add(getPage(l))); const promises = links.map((l) => queue.add(getPage(l)));
await Promise.all(promises); await Promise.all(promises);
@ -38,22 +39,6 @@ function getPage(url: string) {
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"), document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
(a) => new URL(a.href, url).toString() (a) => new URL(a.href, url).toString()
); );
hrefs.forEach((h) => process.stdout.write(h + "\n")); saveUrls(hrefs);
// const nextLinks = Array.from(
// document.querySelectorAll<HTMLAnchorElement>(
// "#atg_store_pagination a[href]"
// ),
// (a) => new URL(a.href, url).toString()
// );
// await Promise.all(
// nextLinks
// .filter((l) => !fetched.has(l))
// .map((l) => {
// fetched.add(l);
// return queue.add(getPage(l));
// })
// );
}; };
} }

10
db-datos/db.ts Normal file
View file

@ -0,0 +1,10 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { DB_PATH } from "./drizzle.config.js";
import { migrateDb } from "./migrate.js";
import * as schema from "./schema.js";

// Apply any pending migrations before the connection is opened, so every
// module importing `db` sees an up-to-date schema (runs once per process).
migrateDb();

// Raw bun:sqlite handle — exported for callers needing low-level access.
export const sqlite = new Database(DB_PATH);
// Shared drizzle instance bound to the whole schema; passing `schema`
// enables the relational query API (db.query.<table>...).
export const db = drizzle(sqlite, { schema });

View file

@ -0,0 +1,8 @@
-- Product-page URLs discovered by the link scrapers.
-- first_seen / last_seen are integer timestamps (written via drizzle's
-- { mode: "timestamp" } columns) used to track URL freshness.
CREATE TABLE `producto_urls` (
	`id` integer PRIMARY KEY AUTOINCREMENT NOT NULL,
	`url` text NOT NULL,
	`first_seen` integer NOT NULL,
	`last_seen` integer NOT NULL
);
--> statement-breakpoint
-- One row per URL; the upsert in saveUrls() conflicts on this index and
-- bumps last_seen instead of inserting a duplicate.
CREATE UNIQUE INDEX `producto_urls_url_unique` ON `producto_urls` (`url`);

View file

@ -0,0 +1,146 @@
{
"version": "5",
"dialect": "sqlite",
"id": "2e398920-ffaf-4d55-ae13-d906cb9e0efa",
"prevId": "082630a9-3744-4e33-bde5-06045ca57d36",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {
"precios_ean_idx": {
"name": "precios_ean_idx",
"columns": [
"ean"
],
"isUnique": false
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
},
"producto_urls": {
"name": "producto_urls",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"first_seen": {
"name": "first_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"last_seen": {
"name": "last_seen",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
}
},
"indexes": {
"producto_urls_url_unique": {
"name": "producto_urls_url_unique",
"columns": [
"url"
],
"isUnique": true
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -64,6 +64,13 @@
"when": 1703807460152, "when": 1703807460152,
"tag": "0008_funny_nighthawk", "tag": "0008_funny_nighthawk",
"breakpoints": true "breakpoints": true
},
{
"idx": 9,
"version": "5",
"when": 1703895109501,
"tag": "0009_breezy_forge",
"breakpoints": true
} }
] ]
} }

View file

@ -22,3 +22,12 @@ export const precios = sqliteTable(
); );
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;
// Product-page URLs discovered by the per-supermarket link scrapers.
// Rows are upserted by saveUrls(): first_seen is set once, last_seen is
// refreshed every time the URL is seen again (conflict on unique `url`).
export const productoUrls = sqliteTable("producto_urls", {
id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
// unique(): the same URL is updated in place rather than duplicated
url: text("url").unique().notNull(),
// { mode: "timestamp" } maps JS Date <-> integer column
firstSeen: integer("first_seen", { mode: "timestamp" }).notNull(),
lastSeen: integer("last_seen", { mode: "timestamp" }).notNull(),
});
export type ProductUrl = typeof productoUrls.$inferSelect;

25
db-datos/urlHelpers.ts Normal file
View file

@ -0,0 +1,25 @@
import { sql } from "drizzle-orm";
import { db } from "./db.js";
import { productoUrls } from "./schema.js";
/**
 * Upserts a batch of product-page URLs into `producto_urls`.
 *
 * Runs inside a single transaction: URLs not seen before are inserted
 * with first_seen = last_seen = now, while already-known URLs (conflict
 * on the unique `url` column) only get their last_seen bumped.
 *
 * @param urls absolute product URLs; duplicates within the batch are fine
 */
export function saveUrls(urls: string[]) {
  db.transaction((tx) => {
    const seenAt = new Date();
    // Prepare the upsert once and reuse it for every row of the batch.
    const upsert = tx
      .insert(productoUrls)
      .values({
        url: sql.placeholder("url"),
        firstSeen: seenAt,
        lastSeen: seenAt,
      })
      .onConflictDoUpdate({
        target: productoUrls.url,
        set: { lastSeen: seenAt },
      })
      .prepare();
    urls.forEach((u) => upsert.run({ url: u }));
  });
}

View file

@ -1,7 +1,8 @@
import pMap from "p-map"; import pMap from "p-map";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js"; import { getHtml } from "../scraper/fetch.js";
(async () => { import { saveUrls } from "db-datos/urlHelpers.js";
const categorias = [ const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen", "https://diaonline.supermercadosdia.com.ar/almacen",
"https://diaonline.supermercadosdia.com.ar/almacen/conservas", "https://diaonline.supermercadosdia.com.ar/almacen/conservas",
@ -66,29 +67,44 @@ import { getHtml } from "../scraper/fetch.js";
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC", "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
]; ];
const links = categorias.flatMap( export async function scrapDiaProducts() {
(link) => await Promise.all([scrapBySite(), scrapBySitemap()]);
}
async function scrapBySitemap() {
// de https://diaonline.supermercadosdia.com.ar/sitemap.xml
const sitemaps = [
"https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
];
await pMap(sitemaps, async (sitemapUrl) => {
const res = await fetch(sitemapUrl);
const xml = await res.text();
let urls = new Set<string>();
new HTMLRewriter()
.on("loc", {
text(element) {
const txt = element.text.trim();
if (!txt) return;
urls.add(txt);
},
})
.transform(new Response(xml));
saveUrls(Array.from(urls));
});
}
async function scrapBySite() {
const links = categorias.flatMap((link) =>
Array.from({ length: 51 }, (x, i) => i).map((i) => { Array.from({ length: 51 }, (x, i) => i).map((i) => {
const url = new URL(link); const url = new URL(link);
url.searchParams.set("page", `${i}`); url.searchParams.set("page", `${i}`);
return url.toString(); return url.toString();
}) })
// el order solo carga con el frontend :(
// .flatMap((link) =>
// [
// "OrderByNameASC",
// "OrderByNameDESC",
// "OrderByTopSaleDESC",
// "OrderByPriceDESC",
// "OrderByPriceASC",
// "",
// ].map((order) => {
// const url = new URL(link);
// url.searchParams.set("order", order);
// return url.toString();
// })
// )
); );
await pMap( await pMap(
@ -103,8 +119,8 @@ import { getHtml } from "../scraper/fetch.js";
), ),
(a) => new URL(a.href, url).toString() (a) => new URL(a.href, url).toString()
); );
hrefs.forEach((h) => process.stdout.write(h + "\n")); saveUrls(hrefs);
}, },
{ concurrency: 32 } { concurrency: 32 }
); );
})(); }

View file

@ -1,14 +1,19 @@
import { mkdtemp, access } from "node:fs/promises"; import { mkdtemp, access, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os"; import { tmpdir } from "node:os";
import { join, resolve } from "node:path"; import { join, resolve } from "node:path";
import { spawn } from "node:child_process"; import { spawn } from "node:child_process";
import { Supermercado } from "db-datos/supermercado.js"; import { Supermercado, hosts } from "db-datos/supermercado.js";
import PQueue from "p-queue"; import PQueue from "p-queue";
import { format, formatDuration, intervalToDuration } from "date-fns"; import { format, formatDuration, intervalToDuration } from "date-fns";
import { parseWarc } from "./scrap.js"; import { parseWarc } from "./scrap.js";
import { S3Client } from "@aws-sdk/client-s3"; import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage"; import { Upload } from "@aws-sdk/lib-storage";
import { BunFile } from "bun"; import { BunFile } from "bun";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
const supermercados: Supermercado[] = [ const supermercados: Supermercado[] = [
Supermercado.Carrefour, Supermercado.Carrefour,
@ -71,11 +76,41 @@ class Auto {
} }
async downloadList(supermercado: Supermercado) { async downloadList(supermercado: Supermercado) {
const listPath = resolve( const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
let listPath: string;
if (supermercado === "Carrefour") {
// TODO: carrefour todavía no tiene un scraper que guarde a la BD
listPath = resolve(
join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`) join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
); );
} else {
const t0 = performance.now();
switch (supermercado) {
case "Dia":
await scrapDiaProducts();
break;
case "Coto":
await scrapCotoProducts();
break;
}
this.inform(
`[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
);
listPath = join(ctxPath, `lista-${supermercado}.txt`);
const host = Object.entries(hosts).find(
([host, supe]) => supe === supermercado
)![0];
const results = await db.query.productoUrls
.findMany({
where: like(productoUrls.url, `%${host}%`),
})
.execute();
const urls = results.map((r) => r.url);
await writeFile(listPath, urls.join("\n") + "\n");
}
const date = new Date(); const date = new Date();
const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
const zstdWarcName = `${supermercado}-${format( const zstdWarcName = `${supermercado}-${format(
date, date,
"yyyy-MM-dd-HH:mm" "yyyy-MM-dd-HH:mm"
@ -98,7 +133,7 @@ class Auto {
const t0 = performance.now(); const t0 = performance.now();
await subproc.exited; await subproc.exited;
this.inform( this.inform(
`wget para ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}` `[wget] ${zstdWarcName} tardó ${formatMs(performance.now() - t0)}`
); );
const gzippedWarcPath = join(ctxPath, "temp.warc.gz"); const gzippedWarcPath = join(ctxPath, "temp.warc.gz");
@ -187,7 +222,6 @@ class Auto {
stdio: ["pipe", null, null], stdio: ["pipe", null, null],
} }
); );
// @ts-expect-error a los types de bun no le gusta????
decompressor.stdout.pipe(compressor.stdin); decompressor.stdout.pipe(compressor.stdin);
compressor.on("close", (code) => { compressor.on("close", (code) => {
if (code !== 0) { if (code !== 0) {

View file

@ -1,32 +1,6 @@
import { request } from "undici";
import { createBrotliDecompress, createUnzip } from "node:zlib";
import { pipeline } from "node:stream/promises";
export async function getHtml(url: string) { export async function getHtml(url: string) {
const res = await request(url, { const res = await fetch(url);
headers: { return readableToBuffer(res.body!);
"Accept-Encoding": "gzip, deflate, br",
},
throwOnError: true,
bodyTimeout: 10 * 60 * 1000,
});
let output: Buffer;
switch (res.headers["content-encoding"]) {
case "gzip":
case "deflate":
output = await pipeline(res.body, createUnzip(), readableToBuffer);
break;
case "br":
output = await pipeline(
res.body,
createBrotliDecompress(),
readableToBuffer
);
break;
default:
output = await readableToBuffer(res.body);
}
return output;
} }
async function readableToBuffer(source: AsyncIterable<any>) { async function readableToBuffer(source: AsyncIterable<any>) {

View file

@ -1,5 +1,3 @@
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio"; import { WARCParser } from "warcio";
import { writeFile } from "fs/promises"; import { writeFile } from "fs/promises";
@ -9,17 +7,11 @@ import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js"; import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
import { DB_PATH } from "db-datos/drizzle.config.js"; import { db } from "db-datos/db.js";
import { migrateDb } from "db-datos/migrate.js";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 3; const PARSER_VERSION = 3;
migrateDb();
const sqlite = new Database(DB_PATH);
const db = drizzle(sqlite, { schema });
const getPrevPrecio = db const getPrevPrecio = db
.select({ id: schema.precios.id }) .select({ id: schema.precios.id })
.from(schema.precios) .from(schema.precios)