mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 06:16:18 +00:00
link-scrapers: reutilizar codigo sitemaps
This commit is contained in:
parent
fa6de68f60
commit
f0798e8620
6 changed files with 27 additions and 29 deletions
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,6 +1,6 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { decodeXML } from "entities";
|
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
export async function scrapCarrefourProducts() {
|
export async function scrapCarrefourProducts() {
|
||||||
await scrapBySitemap();
|
await scrapBySitemap();
|
||||||
|
@ -26,17 +26,7 @@ async function scrapBySitemap() {
|
||||||
async (sitemapUrl) => {
|
async (sitemapUrl) => {
|
||||||
const res = await fetch(sitemapUrl);
|
const res = await fetch(sitemapUrl);
|
||||||
const xml = await res.text();
|
const xml = await res.text();
|
||||||
let urls = new Set<string>();
|
saveUrls(getUrlsFromSitemap(xml));
|
||||||
new HTMLRewriter()
|
|
||||||
.on("loc", {
|
|
||||||
text(element) {
|
|
||||||
const txt = element.text.trim();
|
|
||||||
if (!txt) return;
|
|
||||||
urls.add(decodeXML(txt));
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.transform(new Response(xml));
|
|
||||||
saveUrls(Array.from(urls));
|
|
||||||
},
|
},
|
||||||
{ concurrency: 3 }
|
{ concurrency: 3 }
|
||||||
);
|
);
|
||||||
|
|
14
link-scrapers/common.ts
Normal file
14
link-scrapers/common.ts
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import { decodeXML } from "entities";
|
||||||
|
export function getUrlsFromSitemap(xml: string) {
|
||||||
|
let urls = new Set<string>();
|
||||||
|
new HTMLRewriter()
|
||||||
|
.on("loc", {
|
||||||
|
text(element) {
|
||||||
|
const txt = element.text.trim();
|
||||||
|
if (!txt) return;
|
||||||
|
urls.add(decodeXML(txt));
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.transform(new Response(xml));
|
||||||
|
return Array.from(urls);
|
||||||
|
}
|
|
@ -1,7 +1,7 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { decodeXML } from "entities";
|
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
const categorias = [
|
const categorias = [
|
||||||
"https://diaonline.supermercadosdia.com.ar/almacen",
|
"https://diaonline.supermercadosdia.com.ar/almacen",
|
||||||
|
@ -81,21 +81,15 @@ async function scrapBySitemap() {
|
||||||
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
"https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
|
||||||
];
|
];
|
||||||
|
|
||||||
await pMap(sitemaps, async (sitemapUrl) => {
|
await pMap(
|
||||||
const res = await fetch(sitemapUrl);
|
sitemaps,
|
||||||
const xml = await res.text();
|
async (sitemapUrl) => {
|
||||||
let urls = new Set<string>();
|
const res = await fetch(sitemapUrl);
|
||||||
new HTMLRewriter()
|
const xml = await res.text();
|
||||||
.on("loc", {
|
saveUrls(getUrlsFromSitemap(xml));
|
||||||
text(element) {
|
},
|
||||||
const txt = element.text.trim();
|
{ concurrency: 3 }
|
||||||
if (!txt) return;
|
);
|
||||||
urls.add(decodeXML(txt));
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.transform(new Response(xml));
|
|
||||||
saveUrls(Array.from(urls));
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapBySite() {
|
async function scrapBySite() {
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"entities": "^4.5.0",
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"p-queue": "^8.0.1"
|
"p-queue": "^8.0.1"
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
"date-fns": "^3.0.6",
|
"date-fns": "^3.0.6",
|
||||||
"db-datos": "workspace:^",
|
"db-datos": "workspace:^",
|
||||||
"drizzle-orm": "=0.29.1",
|
"drizzle-orm": "=0.29.1",
|
||||||
"entities": "^4.5.0",
|
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
"p-map": "^7.0.1",
|
"p-map": "^7.0.1",
|
||||||
|
|
Loading…
Reference in a new issue