diff --git a/bun.lockb b/bun.lockb
index 39be541..b925de1 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/link-scrapers/carrefour.ts b/link-scrapers/carrefour.ts
index 6779c81..ce92306 100644
--- a/link-scrapers/carrefour.ts
+++ b/link-scrapers/carrefour.ts
@@ -1,6 +1,6 @@
 import pMap from "p-map";
-import { decodeXML } from "entities";
 import { saveUrls } from "db-datos/urlHelpers.js";
+import { getUrlsFromSitemap } from "./common.js";
 
 export async function scrapCarrefourProducts() {
   await scrapBySitemap();
@@ -26,17 +26,7 @@ async function scrapBySitemap() {
     async (sitemapUrl) => {
       const res = await fetch(sitemapUrl);
       const xml = await res.text();
-      let urls = new Set<string>();
-      new HTMLRewriter()
-        .on("loc", {
-          text(element) {
-            const txt = element.text.trim();
-            if (!txt) return;
-            urls.add(decodeXML(txt));
-          },
-        })
-        .transform(new Response(xml));
-      saveUrls(Array.from(urls));
+      saveUrls(getUrlsFromSitemap(xml));
     },
     { concurrency: 3 }
   );
diff --git a/link-scrapers/common.ts b/link-scrapers/common.ts
new file mode 100644
index 0000000..f4107a0
--- /dev/null
+++ b/link-scrapers/common.ts
@@ -0,0 +1,14 @@
+import { decodeXML } from "entities";
+export function getUrlsFromSitemap(xml: string) {
+  let urls = new Set<string>();
+  new HTMLRewriter()
+    .on("loc", {
+      text(element) {
+        const txt = element.text.trim();
+        if (!txt) return;
+        urls.add(decodeXML(txt));
+      },
+    })
+    .transform(new Response(xml));
+  return Array.from(urls);
+}
diff --git a/link-scrapers/dia.ts b/link-scrapers/dia.ts
index 5d77c52..5b469f6 100644
--- a/link-scrapers/dia.ts
+++ b/link-scrapers/dia.ts
@@ -1,7 +1,7 @@
 import pMap from "p-map";
-import { decodeXML } from "entities";
 import { parseHTML } from "linkedom";
 import { saveUrls } from "db-datos/urlHelpers.js";
+import { getUrlsFromSitemap } from "./common.js";
 
 const categorias = [
   "https://diaonline.supermercadosdia.com.ar/almacen",
@@ -81,21 +81,15 @@ async function scrapBySitemap() {
     "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
   ];
 
-  await pMap(sitemaps, async (sitemapUrl) => {
-    const res = await fetch(sitemapUrl);
-    const xml = await res.text();
-    let urls = new Set<string>();
-    new HTMLRewriter()
-      .on("loc", {
-        text(element) {
-          const txt = element.text.trim();
-          if (!txt) return;
-          urls.add(decodeXML(txt));
-        },
-      })
-      .transform(new Response(xml));
-    saveUrls(Array.from(urls));
-  });
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+      saveUrls(getUrlsFromSitemap(xml));
+    },
+    { concurrency: 3 }
+  );
 }
 
 async function scrapBySite() {
diff --git a/link-scrapers/package.json b/link-scrapers/package.json
index ce7f074..9ae66f9 100644
--- a/link-scrapers/package.json
+++ b/link-scrapers/package.json
@@ -11,6 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "entities": "^4.5.0",
     "linkedom": "^0.16.5",
     "p-queue": "^8.0.1"
   }
diff --git a/scraper/package.json b/scraper/package.json
index a3351af..edaf0ca 100644
--- a/scraper/package.json
+++ b/scraper/package.json
@@ -17,7 +17,6 @@
     "date-fns": "^3.0.6",
     "db-datos": "workspace:^",
     "drizzle-orm": "=0.29.1",
-    "entities": "^4.5.0",
     "linkedom": "^0.16.5",
     "nanoid": "^5.0.4",
     "p-map": "^7.0.1",
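
For reference, a minimal sketch of how the extracted getUrlsFromSitemap helper might be exercised on its own. It assumes the Bun runtime (which provides HTMLRewriter and fetch as globals, as these scrapers already rely on); the sitemap URL below is hypothetical, for illustration only:

  import { getUrlsFromSitemap } from "./link-scrapers/common.js";

  // Hypothetical sitemap URL.
  const res = await fetch("https://example.com/sitemap/product-1.xml");
  const xml = await res.text();

  // Returns the decoded text of every non-empty <loc> entry,
  // deduplicated via the internal Set.
  const urls = getUrlsFromSitemap(xml);
  console.log(`${urls.length} product URLs found`);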