scraper links coto

This commit is contained in:
Cat /dev/Nulo 2023-12-21 22:44:18 -03:00
parent 3f7520393a
commit 32d64b83d4
5 changed files with 1137 additions and 0 deletions

29
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,29 @@
{
  // VS Code debugger launch configurations for this repository.
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
    {
      // Runs scraper/scrap.ts from the scraper directory, passing
      // carrefour.warc.gz as its only argument.
      "type": "node",
      "request": "launch",
      "name": "Launch Program",
      "skipFiles": ["<node_internals>/**"],
      "cwd": "${workspaceFolder}/scraper",
      // Registers the tsx ESM loader so node can run the .ts entry point directly.
      "runtimeArgs": ["--import", "tsx/esm"],
      "program": "${workspaceFolder}/scraper/scrap.ts",
      "args": ["carrefour.warc.gz"],
      "outFiles": ["${workspaceFolder}/**/*.js"]
    },
    {
      // Runs coto-link-scraper/index.ts from its own directory, with no arguments.
      "type": "node",
      "request": "launch",
      "name": "coto-link-scraper",
      "skipFiles": ["<node_internals>/**"],
      "cwd": "${workspaceFolder}/coto-link-scraper",
      // Registers the tsx ESM loader so node can run the .ts entry point directly.
      "runtimeArgs": ["--import", "tsx/esm"],
      "program": "${workspaceFolder}/coto-link-scraper/index.ts",
      "outFiles": ["${workspaceFolder}/**/*.js"]
    }
  ]
}

View file

@ -0,0 +1,38 @@
// Import puppeteer
import puppeteer from "puppeteer";
/**
 * Crawls the Coto Digital "almacén" catalog page by page and prints every
 * product link it finds to stdout, one URL per line.
 *
 * The crawl stops when no "Siguiente" (next page) link appears within the
 * timeout, which is treated as having reached the last page. Any other
 * failure is rethrown. The browser is closed in every case.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    // Opened inside the try so a failed page creation or navigation still
    // closes the browser.
    const page = await browser.newPage();
    await page.goto(
      "https://www.cotodigital3.com.ar/sitios/cdigi/browse/catalogo-almac%C3%A9n/"
    );

    /**
     * Waits until at least one product link is present, then collects the
     * absolute URL of every product link currently in the document.
     *
     * @returns the normalized `href` of each `.product_info_container a` element.
     */
    async function getHrefs(): Promise<string[]> {
      const element = await page.waitForSelector(".product_info_container a");
      // Only the wait matters; release the handle right away.
      await element?.dispose();
      return page.evaluate(() =>
        Array.from(
          document.querySelectorAll<HTMLAnchorElement>(
            ".product_info_container a"
          ),
          (a) => new URL(a.href).toString()
        )
      );
    }

    /**
     * Looks for the "Siguiente" pagination link, waiting up to 5 seconds.
     *
     * @returns the link's element handle, or `null` when the wait timed out.
     * @throws any error from the wait other than a timeout.
     */
    async function findNextPageLink() {
      try {
        return await page.waitForSelector('a[title="Siguiente"]', {
          timeout: 5000,
        });
      } catch (error: unknown) {
        // Puppeteer's TimeoutError reports its class name through `error.name`;
        // matching on it avoids needing an extra named import here.
        if (error instanceof Error && error.name === "TimeoutError") {
          return null;
        }
        throw error;
      }
    }

    while (true) {
      const hrefs = await getHrefs();
      for (const href of hrefs) {
        console.log(href);
      }

      const nextLink = await findNextPageLink();
      if (nextLink === null) {
        // No next-page link: the last catalog page has been printed.
        break;
      }
      try {
        // NOTE(review): the click is not synchronised with the following page
        // load, so the next getHrefs() call may still see the previous page's
        // links. Confirm how the site paginates (full navigation vs. in-place
        // update) before adding an explicit wait here.
        await nextLink.click();
      } finally {
        await nextLink.dispose();
      }
    }
  } finally {
    await browser.close();
  }
})();

View file

@ -0,0 +1,17 @@
{
  "name": "coto-link-scraper",
  "type": "module",
  "version": "1.0.0",
  "description": "Prints the product links found in the Coto Digital catalog",
  "main": "index.ts",
  "scripts": {
    "start": "node --import tsx/esm index.ts",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "puppeteer": "^21.6.1",
    "tsx": "^4.7.0"
  }
}

File diff suppressed because it is too large Load diff