mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-21 14:16:18 +00:00
rm coto-crawlee
This commit is contained in:
parent
9a82a556f9
commit
9cb7c0e27e
8 changed files with 0 additions and 234 deletions
|
@ -1,8 +0,0 @@
|
||||||
# configurations
|
|
||||||
.idea
|
|
||||||
|
|
||||||
# crawlee storage folder
|
|
||||||
storage
|
|
||||||
|
|
||||||
# installed files
|
|
||||||
node_modules
|
|
6
coto-crawlee/.gitignore
vendored
6
coto-crawlee/.gitignore
vendored
|
@ -1,6 +0,0 @@
|
||||||
# This file tells Git which files shouldn't be added to source control
|
|
||||||
|
|
||||||
.idea
|
|
||||||
dist
|
|
||||||
node_modules
|
|
||||||
storage
|
|
|
@ -1,51 +0,0 @@
|
||||||
# Specify the base Docker image. You can read more about
|
|
||||||
# the available images at https://crawlee.dev/docs/guides/docker-images
|
|
||||||
# You can also use any other image from Docker Hub.
|
|
||||||
FROM apify/actor-node-playwright-chrome:20 AS builder
|
|
||||||
|
|
||||||
# Copy just package.json and package-lock.json
|
|
||||||
# to speed up the build using Docker layer cache.
|
|
||||||
COPY --chown=myuser package*.json ./
|
|
||||||
|
|
||||||
# Install all dependencies. Don't audit to speed up the installation.
|
|
||||||
RUN npm install --include=dev --audit=false
|
|
||||||
|
|
||||||
# Next, copy the source files using the user set
|
|
||||||
# in the base image.
|
|
||||||
COPY --chown=myuser . ./
|
|
||||||
|
|
||||||
# Compile the TypeScript sources: the `build` script runs `tsc`,
|
|
||||||
# which writes the JavaScript output to ./dist.
|
|
||||||
RUN npm run build
|
|
||||||
|
|
||||||
# Create final image
|
|
||||||
FROM apify/actor-node-playwright-chrome:20
|
|
||||||
|
|
||||||
# Copy only built JS files from builder image
|
|
||||||
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
|
|
||||||
|
|
||||||
# Copy just package.json and package-lock.json
|
|
||||||
# to speed up the build using Docker layer cache.
|
|
||||||
COPY --chown=myuser package*.json ./
|
|
||||||
|
|
||||||
# Install NPM packages, skip optional and development dependencies to
|
|
||||||
# keep the image small. Avoid logging too much and print the dependency
|
|
||||||
# tree for debugging
|
|
||||||
RUN npm --quiet set progress=false \
|
|
||||||
&& npm install --omit=dev --omit=optional \
|
|
||||||
&& echo "Installed NPM packages:" \
|
|
||||||
&& (npm list --omit=dev --all || true) \
|
|
||||||
&& echo "Node.js version:" \
|
|
||||||
&& node --version \
|
|
||||||
&& echo "NPM version:" \
|
|
||||||
&& npm --version
|
|
||||||
|
|
||||||
# Next, copy the remaining files and directories with the source code.
|
|
||||||
# Since we do this after NPM install, quick build will be really fast
|
|
||||||
# for most source file changes.
|
|
||||||
COPY --chown=myuser . ./
|
|
||||||
|
|
||||||
|
|
||||||
# Run the image. If you know you won't need headful browsers,
|
|
||||||
# you can remove the XVFB start script for a micro perf gain.
|
|
||||||
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
|
|
|
@ -1,9 +0,0 @@
|
||||||
# Getting started with Crawlee
|
|
||||||
|
|
||||||
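This project uses `CheerioCrawler` to crawl the Coto Digital store at https://www.cotodigital3.com.ar, following category listings and product pages; unlike the original Crawlee template, it does not drive a browser through [Playwright](https://playwright.dev).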
|
|
||||||
|
|
||||||
You can find more examples and documentation at the following links:
|
|
||||||
|
|
||||||
- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
|
|
||||||
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
|
|
||||||
- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
|
|
Binary file not shown.
|
@ -1,25 +0,0 @@
|
||||||
{
|
|
||||||
"name": "coto-crawlee",
|
|
||||||
"version": "0.0.1",
|
|
||||||
"type": "module",
|
|
||||||
"description": "This is an example of a Crawlee project.",
|
|
||||||
"dependencies": {
|
|
||||||
"crawlee": "^3.0.0",
|
|
||||||
"playwright": "*"
|
|
||||||
},
|
|
||||||
"devDependencies": {
|
|
||||||
"@apify/tsconfig": "^0.1.0",
|
|
||||||
"tsx": "^4.4.0",
|
|
||||||
"typescript": "~5.5.0",
|
|
||||||
"@types/node": "^20.0.0"
|
|
||||||
},
|
|
||||||
"scripts": {
|
|
||||||
"start": "npm run start:dev",
|
|
||||||
"start:prod": "node dist/main.js",
|
|
||||||
"start:dev": "tsx src/main.ts",
|
|
||||||
"build": "tsc",
|
|
||||||
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
|
|
||||||
},
|
|
||||||
"author": "It's not you it's me",
|
|
||||||
"license": "ISC"
|
|
||||||
}
|
|
|
@ -1,123 +0,0 @@
|
||||||
// For more information, see https://crawlee.dev/
|
|
||||||
import {
|
|
||||||
CheerioCrawler,
|
|
||||||
createCheerioRouter,
|
|
||||||
createPlaywrightRouter,
|
|
||||||
enqueueLinks,
|
|
||||||
PlaywrightCrawler,
|
|
||||||
ProxyConfiguration,
|
|
||||||
} from "crawlee";
|
|
||||||
import { readFileSync } from "fs";
|
|
||||||
|
|
||||||
// Proxy URLs built from proxies.txt, which holds one `ip:port:username:password`
// entry per line. Blank lines are ignored; a malformed entry aborts start-up with
// an error naming the offending line number (the line content is never echoed,
// because it contains credentials).
const proxyUrls = readFileSync("proxies.txt", "utf-8")
  .split(/\r?\n/)
  // Remember the 1-based line number before blank lines are dropped.
  .map((line, index) => ({ entry: line.trim(), lineNumber: index + 1 }))
  .filter(({ entry }) => entry.length > 0)
  .map(({ entry, lineNumber }) => {
    // Everything after the third ":" belongs to the password, so a password
    // that itself contains ":" is kept intact.
    const [ip, port, username, ...passwordParts] = entry.split(":");
    if (!ip || !port || !username || passwordParts.length === 0) {
      throw new Error(
        `proxies.txt line ${lineNumber}: expected "ip:port:username:password"`
      );
    }
    // Percent-encode the credentials so characters such as "@", "/" or ":"
    // cannot change the structure of the resulting URL.
    // NOTE(review): assumes proxies.txt stores raw, not already percent-encoded,
    // credentials — confirm.
    const user = encodeURIComponent(username);
    const pass = encodeURIComponent(passwordParts.join(":"));
    return `http://${user}:${pass}@${ip}:${port}`;
  });
// Log only the count: the URLs embed usernames and passwords.
console.log(`Loaded ${proxyUrls.length} proxy URLs from proxies.txt`);
|
|
||||||
// const scrapoxyConfig = {
|
|
||||||
// username: "asdf",
|
|
||||||
// password: "asdf",
|
|
||||||
// proxyUrl: "partido-obrero:8888",
|
|
||||||
// apiUrl: "partido-obrero:8890",
|
|
||||||
// };
|
|
||||||
|
|
||||||
// Proxy configuration handed to the crawler, built from the URLs parsed out of
// proxies.txt. An earlier, now disabled, experiment filled the list with 100
// copies of a single rotating endpoint instead (the Scrapoxy proxy URL, or
// "http://asdfasdf-rotate:asdfasdf@p.webshare.io").
const proxyConf = new ProxyConfiguration({ proxyUrls });
|
|
||||||
|
|
||||||
// Router whose handlers are registered per request label below.
const router = createCheerioRouter();

// DETAIL: a single product page. Reads the product name from the page heading
// and pushes it, together with the loaded URL, under the "products" name.
router.addHandler("DETAIL", async (context) => {
  const { request, parseWithCheerio, pushData } = context;
  const $ = await parseWithCheerio();

  const heading = $("h1.product_page");
  const name = heading.text().trim();

  await pushData({ name, url: request.loadedUrl }, "products");
});
|
|
||||||
// CATEGORY: a product listing page. Enqueues every product link as DETAIL,
// pushes one { title, url } record per listed product under "product-list",
// and finally enqueues the next listing page.
router.addHandler("CATEGORY", async (context) => {
  const { enqueueLinks, pushData, parseWithCheerio } = context;
  const $ = await parseWithCheerio();
  const productLinkSelector = 'a[href^="/sitios/cdigi/producto"]';

  await enqueueLinks({ selector: productLinkSelector, label: "DETAIL" });

  const listItems = $("ul#products").children("li").toArray();
  for (const item of listItems) {
    const entry = $(item);
    const title = entry
      .find(".atg_store_productTitle .descrip_full")
      .text()
      .trim();
    // The href is stored exactly as it appears in the markup.
    const url = entry.find(productLinkSelector).attr("href");
    await pushData({ title, url }, "product-list");
  }

  // Follow the "Siguiente" (next page) link, keeping the CATEGORY label.
  await enqueueLinks({ selector: "[title=Siguiente]", label: "CATEGORY" });
});
|
|
||||||
// Default handler, used for requests that match no labelled handler: enqueue the
// store's browse page and label it CATEGORY so the listing handler takes over.
router.addDefaultHandler(async (context) => {
  const { enqueueLinks } = context;
  const browseUrl = "https://www.cotodigital3.com.ar/sitios/cdigi/browse";

  await enqueueLinks({ urls: [browseUrl], label: "CATEGORY" });
});
|
|
||||||
|
|
||||||
// CheerioCrawler downloads pages over plain HTTP and parses the HTML with
// Cheerio; despite the Playwright template this file started from, no browser
// is launched here.
const sessionPoolOptions = {
  // Status codes that the session pool is told to treat as "blocked".
  blockedStatusCodes: [401, 403, 429, 500],
};

const startUrls = ["https://www.cotodigital3.com.ar/"];

const crawler = new CheerioCrawler({
  requestHandler: router,
  proxyConfiguration: proxyConf,

  useSessionPool: true,
  sessionPoolOptions,
  ignoreSslErrors: true,

  minConcurrency: 10,
  maxConcurrency: 50,
  maxRequestRetries: 50,
  requestHandlerTimeoutSecs: 30,

  // A disabled `errorHandler` used to live here for the Scrapoxy experiment:
  // it logged an error when the response had no statusCode, waited 30 s on
  // status 557 ("No proxy available"), and otherwise POSTed to the Scrapoxy API
  // (`/api/scraper/project/proxies/remove`) to drop the proxy named in the
  // `x-scrapoxy-proxyname` response header.
});

await crawler.run(startUrls);
|
|
|
@ -1,12 +0,0 @@
|
||||||
{
|
|
||||||
"extends": "@apify/tsconfig",
|
|
||||||
"compilerOptions": {
|
|
||||||
"module": "NodeNext",
|
|
||||||
"moduleResolution": "NodeNext",
|
|
||||||
"target": "ES2022",
|
|
||||||
"outDir": "dist",
|
|
||||||
"noUnusedLocals": false,
|
|
||||||
"lib": ["DOM"]
|
|
||||||
},
|
|
||||||
"include": ["./src/**/*"]
|
|
||||||
}
|
|
Loading…
Reference in a new issue