mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-21 22:16:18 +00:00
WIP: coto-crawlee
This commit is contained in:
parent
258346e3d6
commit
9a82a556f9
8 changed files with 234 additions and 0 deletions
8
coto-crawlee/.dockerignore
Normal file
8
coto-crawlee/.dockerignore
Normal file
@@ -0,0 +1,8 @@
# configurations
.idea

# crawlee storage folder
storage

# installed files
node_modules
6
coto-crawlee/.gitignore
vendored
Normal file
6
coto-crawlee/.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
# This file tells Git which files shouldn't be added to source control

.idea
dist
node_modules
storage
51
coto-crawlee/Dockerfile
Normal file
51
coto-crawlee/Dockerfile
Normal file
@@ -0,0 +1,51 @@
# Multi-stage build: stage 1 ("builder") installs dev dependencies and
# compiles TypeScript; stage 2 keeps only the built JS plus production
# dependencies, so the final image stays small.

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project (tsc — see package.json "build" script).
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-chrome:20

# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
9
coto-crawlee/README.md
Normal file
9
coto-crawlee/README.md
Normal file
@@ -0,0 +1,9 @@
# coto-crawlee

This project uses Crawlee's `CheerioCrawler` to crawl the Coto Digital store (https://www.cotodigital3.com.ar/), extracting product listings and product detail pages. It was bootstrapped from the Crawlee getting-started template.

You can find more examples and documentation at the following links:

- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
- `CheerioCrawler` [API documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler)
- Other [examples](https://crawlee.dev/docs/examples)
BIN
coto-crawlee/bun.lockb
Executable file
BIN
coto-crawlee/bun.lockb
Executable file
Binary file not shown.
25
coto-crawlee/package.json
Normal file
25
coto-crawlee/package.json
Normal file
@@ -0,0 +1,25 @@
{
  "name": "coto-crawlee",
  "version": "0.0.1",
  "type": "module",
  "description": "This is an example of a Crawlee project.",
  "dependencies": {
    "crawlee": "^3.0.0",
    "playwright": "*"
  },
  "devDependencies": {
    "@apify/tsconfig": "^0.1.0",
    "tsx": "^4.4.0",
    "typescript": "~5.5.0",
    "@types/node": "^20.0.0"
  },
  "scripts": {
    "start": "npm run start:dev",
    "start:prod": "node dist/main.js",
    "start:dev": "tsx src/main.ts",
    "build": "tsc",
    "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
  },
  "author": "It's not you it's me",
  "license": "ISC"
}
123
coto-crawlee/src/main.ts
Normal file
123
coto-crawlee/src/main.ts
Normal file
@@ -0,0 +1,123 @@
||||||
|
// For more information, see https://crawlee.dev/
|
||||||
|
import {
|
||||||
|
CheerioCrawler,
|
||||||
|
createCheerioRouter,
|
||||||
|
createPlaywrightRouter,
|
||||||
|
enqueueLinks,
|
||||||
|
PlaywrightCrawler,
|
||||||
|
ProxyConfiguration,
|
||||||
|
} from "crawlee";
|
||||||
|
import { readFileSync } from "fs";
|
||||||
|
|
||||||
|
const proxyUrls = readFileSync("proxies.txt", "utf-8")
|
||||||
|
.split(/\r?\n/)
|
||||||
|
.filter((x) => x.trim().length > 0)
|
||||||
|
.map((x) => {
|
||||||
|
const [ip, port, username, password] = x.split(":");
|
||||||
|
return `http://${username}:${password}@${ip}:${port}`;
|
||||||
|
});
|
||||||
|
console.log(proxyUrls);
|
||||||
|
// const scrapoxyConfig = {
|
||||||
|
// username: "asdf",
|
||||||
|
// password: "asdf",
|
||||||
|
// proxyUrl: "partido-obrero:8888",
|
||||||
|
// apiUrl: "partido-obrero:8890",
|
||||||
|
// };
|
||||||
|
|
||||||
|
const proxyConf = new ProxyConfiguration({
|
||||||
|
proxyUrls: proxyUrls, // proxyUrls: Array(100).fill(
|
||||||
|
// `http://${scrapoxyConfig.username}:${scrapoxyConfig.password}@${scrapoxyConfig.proxyUrl}/`
|
||||||
|
// // "http://asdfasdf-rotate:asdfasdf@p.webshare.io"
|
||||||
|
// ),
|
||||||
|
});
|
||||||
|
|
||||||
|
const router = createCheerioRouter();
|
||||||
|
router.addHandler("DETAIL", async ({ request, parseWithCheerio, pushData }) => {
|
||||||
|
const $ = await parseWithCheerio();
|
||||||
|
|
||||||
|
const name = $("h1.product_page").text().trim();
|
||||||
|
await pushData({ name, url: request.loadedUrl }, "products");
|
||||||
|
});
|
||||||
|
router.addHandler(
|
||||||
|
"CATEGORY",
|
||||||
|
async ({ request, enqueueLinks, log, pushData, parseWithCheerio }) => {
|
||||||
|
// const title = await page.title();
|
||||||
|
// log.info(`Title of ${request.loadedUrl} is '${title}'`);
|
||||||
|
const $ = await parseWithCheerio();
|
||||||
|
|
||||||
|
await enqueueLinks({
|
||||||
|
selector: 'a[href^="/sitios/cdigi/producto"]',
|
||||||
|
label: "DETAIL",
|
||||||
|
});
|
||||||
|
|
||||||
|
const productsEls = $("ul#products").children("li");
|
||||||
|
|
||||||
|
for (const el of productsEls) {
|
||||||
|
const title = $(el)
|
||||||
|
.find(".atg_store_productTitle .descrip_full")
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
const href = $(el).find('a[href^="/sitios/cdigi/producto"]');
|
||||||
|
await pushData({ title, url: href.attr("href") }, "product-list");
|
||||||
|
}
|
||||||
|
// Save results as JSON to ./storage/datasets/default
|
||||||
|
|
||||||
|
// Extract links from the current page
|
||||||
|
// and add them to the crawling queue.
|
||||||
|
await enqueueLinks({
|
||||||
|
selector: "[title=Siguiente]",
|
||||||
|
label: "CATEGORY",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
);
|
||||||
|
router.addDefaultHandler(async ({ enqueueLinks }) => {
|
||||||
|
await enqueueLinks({
|
||||||
|
urls: ["https://www.cotodigital3.com.ar/sitios/cdigi/browse"],
|
||||||
|
label: "CATEGORY",
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// PlaywrightCrawler crawls the web using a headless
|
||||||
|
// browser controlled by the Playwright library.
|
||||||
|
const crawler = new CheerioCrawler({
|
||||||
|
proxyConfiguration: proxyConf,
|
||||||
|
ignoreSslErrors: true,
|
||||||
|
|
||||||
|
useSessionPool: true,
|
||||||
|
sessionPoolOptions: {
|
||||||
|
blockedStatusCodes: [401, 403, 429, 500],
|
||||||
|
},
|
||||||
|
minConcurrency: 10,
|
||||||
|
maxConcurrency: 50,
|
||||||
|
maxRequestRetries: 50,
|
||||||
|
requestHandlerTimeoutSecs: 30,
|
||||||
|
requestHandler: router,
|
||||||
|
// async errorHandler({ request, response, log }) {
|
||||||
|
// if (!response || !("statusCode" in response)) {
|
||||||
|
// log.error("Response has no statusCode", { response });
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// if (response.statusCode === 557) {
|
||||||
|
// log.warning("No proxy available, waiting");
|
||||||
|
// await new Promise((resolve) => setTimeout(resolve, 30 * 1000));
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
// const proxyName = response.headers["x-scrapoxy-proxyname"];
|
||||||
|
// log.warning(`Resetting proxy`, { proxyName, headers: response.headers });
|
||||||
|
// const res = await fetch(
|
||||||
|
// `http://${scrapoxyConfig.apiUrl}/api/scraper/project/proxies/remove`,
|
||||||
|
// {
|
||||||
|
// method: "POST",
|
||||||
|
// body: JSON.stringify([{ id: proxyName, force: false }]),
|
||||||
|
// headers: {
|
||||||
|
// Authorization: `Basic ${btoa(`${scrapoxyConfig.username}:${scrapoxyConfig.password}`)}`,
|
||||||
|
// "Content-Type": "application/json",
|
||||||
|
// },
|
||||||
|
// }
|
||||||
|
// );
|
||||||
|
// if (!res.ok)
|
||||||
|
// log.error(`status code ${res.status}`, { json: await res.json() });
|
||||||
|
// },
|
||||||
|
});
|
||||||
|
|
||||||
|
await crawler.run(["https://www.cotodigital3.com.ar/"]);
|
12
coto-crawlee/tsconfig.json
Normal file
12
coto-crawlee/tsconfig.json
Normal file
@@ -0,0 +1,12 @@
{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "target": "ES2022",
    "outDir": "dist",
    "noUnusedLocals": false,
    "lib": ["DOM"]
  },
  "include": ["./src/**/*"]
}
Loading…
Reference in a new issue