diff --git a/bun.lockb b/bun.lockb index 66e0313..39be541 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/carrefour-link-scraper/package.json b/carrefour-link-scraper/package.json deleted file mode 100644 index a404536..0000000 --- a/carrefour-link-scraper/package.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "carrefour-link-scraper", - "type": "module", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": [], - "author": "", - "license": "ISC", - "dependencies": { - "linkedom": "^0.16.5", - "p-map": "^7.0.1" - } -} diff --git a/dia-link-scraper/package.json b/dia-link-scraper/package.json deleted file mode 100644 index 57ff6fd..0000000 --- a/dia-link-scraper/package.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "dia-link-scraper", - "type": "module", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": [], - "author": "", - "license": "ISC", - "dependencies": { - "linkedom": "^0.16.5", - "p-map": "^7.0.0" - } -} diff --git a/carrefour-link-scraper/index.ts b/link-scrapers/carrefour.ts similarity index 100% rename from carrefour-link-scraper/index.ts rename to link-scrapers/carrefour.ts diff --git a/coto-link-scraper/index.ts b/link-scrapers/coto.ts similarity index 89% rename from coto-link-scraper/index.ts rename to link-scrapers/coto.ts index b25be9b..d3de22d 100644 --- a/coto-link-scraper/index.ts +++ b/link-scrapers/coto.ts @@ -1,4 +1,3 @@ -import { getHtml } from "../scraper/fetch.js"; import { parseHTML } from "linkedom"; import PQueue from "p-queue"; import { saveUrls } from "db-datos/urlHelpers.js"; @@ -28,12 +27,13 @@ function getPage(url: string) { return async () => { let html; try { - html = await getHtml(url); + const res = await fetch(url); + html = await res.text(); } catch (error) { await getPage(url)(); return; } - const { document } = parseHTML(html.toString("utf-8")); + const { document } = parseHTML(html); const hrefs = Array.from( document.querySelectorAll(".product_info_container a"), diff --git a/dia-link-scraper/index.ts b/link-scrapers/dia.ts similarity index 97% rename from dia-link-scraper/index.ts rename to link-scrapers/dia.ts index 09b825c..5d77c52 100644 --- a/dia-link-scraper/index.ts +++ b/link-scrapers/dia.ts @@ -1,7 +1,6 @@ import pMap from "p-map"; import { decodeXML } from "entities"; import { parseHTML } from "linkedom"; -import { getHtml } from "../scraper/fetch.js"; import { saveUrls } from "db-datos/urlHelpers.js"; const categorias = [ @@ -111,8 +110,9 @@ async function scrapBySite() { await pMap( links, async (url) => { - const html = await getHtml(url); - const { document } = parseHTML(html.toString("utf-8")); + const res = await fetch(url); + const html = await res.text(); + const { document } = parseHTML(html); const hrefs = Array.from( document.querySelectorAll( diff --git a/coto-link-scraper/package.json b/link-scrapers/package.json similarity index 90% rename from coto-link-scraper/package.json rename to link-scrapers/package.json index 04e7eac..ce7f074 100644 --- a/coto-link-scraper/package.json +++ b/link-scrapers/package.json @@ -1,5 +1,5 @@ { - "name": "coto-link-scraper", + "name": "link-scrapers", "type": "module", "version": "1.0.0", "description": "", diff --git a/package.json b/package.json index 30324d4..6cc95ea 100644 --- a/package.json +++ b/package.json @@ -2,9 +2,7 @@ "name": "preciazo", "private": true, "workspaces": [ - "dia-link-scraper", - "coto-link-scraper", - "carrefour-link-scraper", + "link-scrapers", "scraper", "sitio", "db-datos" diff --git a/readme.md b/readme.md index 18c1a50..847b3fb 100644 --- a/readme.md +++ b/readme.md @@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos ## componentes (en orden de proceso) -- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear +- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear (no hace falta correrlos porque ya hay listas armadas en [data/](./data/)) diff --git a/scraper/auto.ts b/scraper/auto.ts index b4bd8d5..21643fc 100644 --- a/scraper/auto.ts +++ b/scraper/auto.ts @@ -8,9 +8,9 @@ import { downloadList } from "./scrap.js"; import { db } from "db-datos/db.js"; import { like } from "drizzle-orm"; import { productoUrls } from "db-datos/schema.js"; -import { scrapDiaProducts } from "../dia-link-scraper/index.js"; -import { scrapCotoProducts } from "../coto-link-scraper/index.js"; -import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; +import { scrapDiaProducts } from "../link-scrapers/dia.js"; +import { scrapCotoProducts } from "../link-scrapers/coto.js"; +import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; const supermercados: Supermercado[] = [ Supermercado.Carrefour, diff --git a/scraper/cli.ts b/scraper/cli.ts index b68bda7..0304ba8 100644 --- a/scraper/cli.ts +++ b/scraper/cli.ts @@ -1,6 +1,6 @@ -import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js"; -import { scrapCotoProducts } from "../coto-link-scraper/index.js"; -import { scrapDiaProducts } from "../dia-link-scraper/index.js"; +import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js"; +import { scrapCotoProducts } from "../link-scrapers/coto.js"; +import { scrapDiaProducts } from "../link-scrapers/dia.js"; import { auto } from "./auto.js"; import { downloadList, getProduct } from "./scrap.js"; diff --git a/scraper/fetch.ts b/scraper/fetch.ts deleted file mode 100644 index 59bffb2..0000000 --- a/scraper/fetch.ts +++ /dev/null @@ -1,13 +0,0 @@ -export async function getHtml(url: string) { - const res = await fetch(url); - return readableToBuffer(res.body!); -} - -async function readableToBuffer(source: AsyncIterable) { - // https://stackoverflow.com/a/72891118 - const buffers = []; - for await (const data of source) { - buffers.push(data); - } - return Buffer.concat(buffers); -} diff --git a/scraper/parsers/carrefour.ts b/scraper/parsers/carrefour.ts index e3f74fa..b025f62 100644 --- a/scraper/parsers/carrefour.ts +++ b/scraper/parsers/carrefour.ts @@ -1,6 +1,6 @@ import { parseHTML } from "linkedom"; import { Precioish } from "../scrap.js"; -import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js"; +import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js"; function parseScriptJson(dom: Window, varname: string): T { const script = dom.window.document.querySelector( diff --git a/scraper/common.ts b/scraper/parsers/common.ts similarity index 100% rename from scraper/common.ts rename to scraper/parsers/common.ts diff --git a/scraper/parsers/dia.ts b/scraper/parsers/dia.ts index be3e2c5..5fdd1ca 100644 --- a/scraper/parsers/dia.ts +++ b/scraper/parsers/dia.ts @@ -1,6 +1,6 @@ import { parseHTML } from "linkedom"; import { type Precioish } from "../scrap.js"; -import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js"; +import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js"; export function getDiaProduct(html: string | Buffer): Precioish { const dom = parseHTML(html);