Mirror of https://github.com/catdevnull/preciazo.git, synced 2024-11-23 06:36:19 +00:00
scrape coto links without puppeteer
This commit is contained in:
parent d85b54f837
commit 1b74a16fff
2 changed files with 58 additions and 34 deletions
@@ -1,37 +1,59 @@
-import puppeteer from "puppeteer";
+import { getHtml } from "../scraper/fetch.js";
+import { parseHTML } from "linkedom";
+import PQueue from "p-queue";
 
-(async () => {
-  const browser = await puppeteer.launch();
-  const page = await browser.newPage();
-  await page.goto(
-    "https://www.cotodigital3.com.ar/sitios/cdigi/browse/catalogo-almac%C3%A9n/"
-  );
-  try {
-    async function getHrefs() {
-      const element = await page.waitForSelector(".product_info_container a");
-      await element?.dispose();
-      const hrefs = await page.evaluate(() =>
-        Array.from(
-          document.querySelectorAll<HTMLAnchorElement>(
-            ".product_info_container a"
-          ),
-          (a) => new URL(a.href).toString()
-        )
-      );
-      return hrefs;
-    }
-
-    while (true) {
-      const hrefs = await getHrefs();
-      hrefs.forEach((href) => console.log(href));
-
-      const btn = await page.waitForSelector('a[title="Siguiente"]', {
-        timeout: 5000,
-      });
-      await btn?.click();
-      await btn?.dispose();
-    }
-  } finally {
-    await browser.close();
-  }
-})();
+// let fetched = new Set<string>();
+{
+  const initial =
+    "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
+
+  const queue = new PQueue({ concurrency: 2 });
+
+  const pageSize = 300; // up to 1000
+  const links = Array.from({ length: Math.ceil(29000 / 300) }, (x, i) => i).map(
+    (i) => {
+      const url = new URL(initial);
+      url.searchParams.set("No", `${i * pageSize}`);
+      url.searchParams.set("Nrpp", `${pageSize}`);
+      return url.toString();
+    }
+  );
+
+  const promises = links.map((l) => queue.add(getPage(l)));
+  await Promise.all(promises);
+}
+
+function getPage(url: string) {
+  return async () => {
+    let html;
+    try {
+      html = await getHtml(url);
+    } catch (error) {
+      await getPage(url)();
+      return;
+    }
+    const { document } = parseHTML(html.toString("utf-8"));
+
+    const hrefs = Array.from(
+      document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
+      (a) => new URL(a.href, url).toString()
+    );
+    hrefs.forEach((h) => process.stdout.write(h + "\n"));
+
+    // const nextLinks = Array.from(
+    //   document.querySelectorAll<HTMLAnchorElement>(
+    //     "#atg_store_pagination a[href]"
+    //   ),
+    //   (a) => new URL(a.href, url).toString()
+    // );
+
+    // await Promise.all(
+    //   nextLinks
+    //     .filter((l) => !fetched.has(l))
+    //     .map((l) => {
+    //       fetched.add(l);
+    //       return queue.add(getPage(l));
+    //     })
+    // );
+  };
+}
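Note on the change above: pagination no longer needs a browser. Instead of clicking the a[title="Siguiente"] pager, the new code walks Coto's catalog through the Nrpp (records per page) and No (record offset) query parameters, fetching two pages at a time through p-queue. With pageSize = 300 and the commit's hard-coded assumption of roughly 29000 products, that works out to 97 plain GET requests. A quick standalone check of the offset arithmetic:

// Offsets requested by the diff above step 0, 300, 600, ..., 28800.
// 29000 is the commit's hard-coded catalog-size assumption, not a fetched value.
const pageSize = 300;
const pages = Math.ceil(29000 / pageSize);
console.log(pages); // 97 requests in total
console.log((pages - 1) * pageSize); // 28800, the last "No" offset requested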
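The new module also imports getHtml from ../scraper/fetch.js, whose implementation is not part of this diff. Given that the dependency change below adds undici, and that the caller decodes the result with html.toString("utf-8"), a minimal sketch of such a helper could look like this (the error handling and return type are assumptions, not the repository's actual code):

import { request } from "undici";

// Hypothetical sketch of ../scraper/fetch.js -- not shown in this commit.
// Returns a Buffer, since the caller calls html.toString("utf-8") on it.
export async function getHtml(url: string): Promise<Buffer> {
  const res = await request(url);
  if (res.statusCode < 200 || res.statusCode >= 300) {
    throw new Error(`HTTP ${res.statusCode} for ${url}`);
  }
  return Buffer.from(await res.body.arrayBuffer());
}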
@@ -11,7 +11,9 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "puppeteer": "^21.6.1",
-    "tsx": "^4.7.0"
+    "linkedom": "^0.16.5",
+    "p-queue": "^8.0.1",
+    "tsx": "^4.7.0",
+    "undici": "^6.2.0"
   }
 }
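One more note on the scraper code above: when getHtml throws, getPage retries by calling getPage(url)() recursively with no attempt limit, so a permanently failing URL is retried forever. A bounded, loop-based variant (illustrative only; this is not what the commit ships) could look like:

import { getHtml } from "../scraper/fetch.js";

// Illustrative bounded retry, as an alternative to the diff's unbounded
// recursive retry. getHtml is the same helper the commit imports.
async function getHtmlWithRetry(url: string, attempts = 5): Promise<Buffer> {
  let lastError: unknown;
  for (let i = 1; i <= attempts; i++) {
    try {
      return await getHtml(url);
    } catch (error) {
      lastError = error;
      console.error(`retry ${i}/${attempts} failed for ${url}`);
    }
  }
  throw lastError;
}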