scrapear links de coto sin puppeteer

This commit is contained in:
Cat /dev/Nulo 2023-12-22 15:46:36 -03:00
parent d85b54f837
commit 1b74a16fff
2 changed files with 58 additions and 34 deletions

View file

@@ -1,37 +1,59 @@
import puppeteer from "puppeteer"; import { getHtml } from "../scraper/fetch.js";
import { parseHTML } from "linkedom";
import PQueue from "p-queue";
(async () => { // let fetched = new Set<string>();
const browser = await puppeteer.launch(); {
const page = await browser.newPage(); const initial =
await page.goto( "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
"https://www.cotodigital3.com.ar/sitios/cdigi/browse/catalogo-almac%C3%A9n/"
);
async function getHrefs() { const queue = new PQueue({ concurrency: 2 });
const element = await page.waitForSelector(".product_info_container a");
await element?.dispose(); const pageSize = 300; // hasta 1000
const hrefs = await page.evaluate(() => const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
Array.from( (i) => {
document.querySelectorAll<HTMLAnchorElement>( const url = new URL(initial);
".product_info_container a" url.searchParams.set("No", `${i * pageSize}`);
), url.searchParams.set("Nrpp", `${pageSize}`);
(a) => new URL(a.href).toString() return url.toString();
)
);
return hrefs;
} }
);
const promises = links.map((l) => queue.add(getPage(l)));
await Promise.all(promises);
}
function getPage(url: string) {
return async () => {
let html;
try { try {
while (true) { html = await getHtml(url);
const hrefs = await getHrefs(); } catch (error) {
hrefs.forEach((href) => console.log(href)); await getPage(url)();
return;
}
const { document } = parseHTML(html.toString("utf-8"));
const btn = await page.waitForSelector('a[title="Siguiente"]', { const hrefs = Array.from(
timeout: 5000, document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
}); (a) => new URL(a.href, url).toString()
await btn?.click(); );
await btn?.dispose(); hrefs.forEach((h) => process.stdout.write(h + "\n"));
}
} finally { // const nextLinks = Array.from(
await browser.close(); // document.querySelectorAll<HTMLAnchorElement>(
} // "#atg_store_pagination a[href]"
})(); // ),
// (a) => new URL(a.href, url).toString()
// );
// await Promise.all(
// nextLinks
// .filter((l) => !fetched.has(l))
// .map((l) => {
// fetched.add(l);
// return queue.add(getPage(l));
// })
// );
};
}

View file

@@ -11,7 +11,9 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"puppeteer": "^21.6.1", "linkedom": "^0.16.5",
"tsx": "^4.7.0" "p-queue": "^8.0.1",
"tsx": "^4.7.0",
"undici": "^6.2.0"
} }
} }