link-scrapers: reuse sitemap code

Cat /dev/Nulo 2024-01-04 18:12:55 -03:00
parent fa6de68f60
commit f0798e8620
6 changed files with 27 additions and 29 deletions

BIN bun.lockb
Binary file not shown.

@@ -1,6 +1,6 @@
 import pMap from "p-map";
-import { decodeXML } from "entities";
 import { saveUrls } from "db-datos/urlHelpers.js";
+import { getUrlsFromSitemap } from "./common.js";

 export async function scrapCarrefourProducts() {
   await scrapBySitemap();
@@ -26,17 +26,7 @@ async function scrapBySitemap() {
     async (sitemapUrl) => {
       const res = await fetch(sitemapUrl);
       const xml = await res.text();
-      let urls = new Set<string>();
-      new HTMLRewriter()
-        .on("loc", {
-          text(element) {
-            const txt = element.text.trim();
-            if (!txt) return;
-            urls.add(decodeXML(txt));
-          },
-        })
-        .transform(new Response(xml));
-      saveUrls(Array.from(urls));
+      saveUrls(getUrlsFromSitemap(xml));
     },
     { concurrency: 3 }
   );

link-scrapers/common.ts (new file)

@@ -0,0 +1,14 @@
+import { decodeXML } from "entities";
+export function getUrlsFromSitemap(xml: string) {
+  let urls = new Set<string>();
+  new HTMLRewriter()
+    .on("loc", {
+      text(element) {
+        const txt = element.text.trim();
+        if (!txt) return;
+        urls.add(decodeXML(txt));
+      },
+    })
+    .transform(new Response(xml));
+  return Array.from(urls);
+}
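
For reference, a minimal usage sketch of the new helper (this assumes the Bun runtime, which provides the global HTMLRewriter that getUrlsFromSitemap relies on; the sitemap URL is illustrative):

  import { getUrlsFromSitemap } from "./common.js";

  // Fetch one sitemap and collect the decoded <loc> URLs it lists.
  const res = await fetch("https://example.com/sitemap/product-1.xml");
  const urls = getUrlsFromSitemap(await res.text());
  console.log(`found ${urls.length} URLs`);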


@@ -1,7 +1,7 @@
 import pMap from "p-map";
-import { decodeXML } from "entities";
 import { parseHTML } from "linkedom";
 import { saveUrls } from "db-datos/urlHelpers.js";
+import { getUrlsFromSitemap } from "./common.js";

 const categorias = [
   "https://diaonline.supermercadosdia.com.ar/almacen",
@@ -81,21 +81,15 @@ async function scrapBySitemap() {
     "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
   ];
-  await pMap(sitemaps, async (sitemapUrl) => {
-    const res = await fetch(sitemapUrl);
-    const xml = await res.text();
-    let urls = new Set<string>();
-    new HTMLRewriter()
-      .on("loc", {
-        text(element) {
-          const txt = element.text.trim();
-          if (!txt) return;
-          urls.add(decodeXML(txt));
-        },
-      })
-      .transform(new Response(xml));
-    saveUrls(Array.from(urls));
-  });
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+      saveUrls(getUrlsFromSitemap(xml));
+    },
+    { concurrency: 3 }
+  );
 }

 async function scrapBySite() {
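
Note: besides reusing getUrlsFromSitemap, this hunk also passes { concurrency: 3 } to pMap in the Día scraper, matching the Carrefour scraper; previously this call fetched all sitemaps at once with p-map's default (unlimited) concurrency.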


@@ -11,6 +11,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "entities": "^4.5.0",
     "linkedom": "^0.16.5",
     "p-queue": "^8.0.1"
   }


@@ -17,7 +17,6 @@
     "date-fns": "^3.0.6",
     "db-datos": "workspace:^",
     "drizzle-orm": "=0.29.1",
-    "entities": "^4.5.0",
     "linkedom": "^0.16.5",
     "nanoid": "^5.0.4",
     "p-map": "^7.0.1",