mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-21 14:16:18 +00:00
rm coto-crawlee
This commit is contained in:
parent
9a82a556f9
commit
9cb7c0e27e
8 changed files with 0 additions and 234 deletions
|
@ -1,8 +0,0 @@
|
|||
# configurations
|
||||
.idea
|
||||
|
||||
# crawlee storage folder
|
||||
storage
|
||||
|
||||
# installed files
|
||||
node_modules
|
6
coto-crawlee/.gitignore
vendored
6
coto-crawlee/.gitignore
vendored
|
@ -1,6 +0,0 @@
|
|||
# This file tells Git which files shouldn't be added to source control
|
||||
|
||||
.idea
|
||||
dist
|
||||
node_modules
|
||||
storage
|
|
@ -1,51 +0,0 @@
|
|||
# Specify the base Docker image. You can read more about
|
||||
# the available images at https://crawlee.dev/docs/guides/docker-images
|
||||
# You can also use any other image from Docker Hub.
|
||||
FROM apify/actor-node-playwright-chrome:20 AS builder
|
||||
|
||||
# Copy just package.json and package-lock.json
|
||||
# to speed up the build using Docker layer cache.
|
||||
COPY --chown=myuser package*.json ./
|
||||
|
||||
# Install all dependencies. Don't audit to speed up the installation.
|
||||
RUN npm install --include=dev --audit=false
|
||||
|
||||
# Next, copy the source files using the user set
|
||||
# in the base image.
|
||||
COPY --chown=myuser . ./
|
||||
|
||||
# Build the project; dependencies were already installed in the step above.
|
||||
# `npm run build` runs `tsc`, which emits the compiled files into ./dist.
|
||||
RUN npm run build
|
||||
|
||||
# Create final image
|
||||
FROM apify/actor-node-playwright-chrome:20
|
||||
|
||||
# Copy only built JS files from builder image
|
||||
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
|
||||
|
||||
# Copy just package.json and package-lock.json
|
||||
# to speed up the build using Docker layer cache.
|
||||
COPY --chown=myuser package*.json ./
|
||||
|
||||
# Install NPM packages, skip optional and development dependencies to
|
||||
# keep the image small. Avoid logging too much and print the dependency
|
||||
# tree for debugging
|
||||
RUN npm --quiet set progress=false \
|
||||
&& npm install --omit=dev --omit=optional \
|
||||
&& echo "Installed NPM packages:" \
|
||||
&& (npm list --omit=dev --all || true) \
|
||||
&& echo "Node.js version:" \
|
||||
&& node --version \
|
||||
&& echo "NPM version:" \
|
||||
&& npm --version
|
||||
|
||||
# Next, copy the remaining files and directories with the source code.
|
||||
# Since we do this after NPM install, quick build will be really fast
|
||||
# for most source file changes.
|
||||
COPY --chown=myuser . ./
|
||||
|
||||
|
||||
# Run the image. If you know you won't need headful browsers,
|
||||
# you can remove the XVFB start script for a micro perf gain.
|
||||
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
|
|
@ -1,9 +0,0 @@
|
|||
# Getting started with Crawlee
|
||||
|
||||
This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
|
||||
|
||||
You can find more examples and documentation at the following links:
|
||||
|
||||
- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
|
||||
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
|
||||
- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
|
Binary file not shown.
|
@ -1,25 +0,0 @@
|
|||
{
|
||||
"name": "coto-crawlee",
|
||||
"version": "0.0.1",
|
||||
"type": "module",
|
||||
"description": "This is an example of a Crawlee project.",
|
||||
"dependencies": {
|
||||
"crawlee": "^3.0.0",
|
||||
"playwright": "*"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@apify/tsconfig": "^0.1.0",
|
||||
"tsx": "^4.4.0",
|
||||
"typescript": "~5.5.0",
|
||||
"@types/node": "^20.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "npm run start:dev",
|
||||
"start:prod": "node dist/main.js",
|
||||
"start:dev": "tsx src/main.ts",
|
||||
"build": "tsc",
|
||||
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
|
||||
},
|
||||
"author": "It's not you it's me",
|
||||
"license": "ISC"
|
||||
}
|
|
@ -1,123 +0,0 @@
|
|||
// For more information, see https://crawlee.dev/
|
||||
import {
|
||||
CheerioCrawler,
|
||||
createCheerioRouter,
|
||||
createPlaywrightRouter,
|
||||
enqueueLinks,
|
||||
PlaywrightCrawler,
|
||||
ProxyConfiguration,
|
||||
} from "crawlee";
|
||||
import { readFileSync } from "fs";
|
||||
|
||||
/**
 * Converts one `ip:port:username:password` line from proxies.txt into a proxy URL.
 *
 * Everything after the third colon is treated as the password, so a password
 * that itself contains ":" is kept intact. The credentials are percent-encoded
 * so that characters such as "@" or "/" cannot corrupt the resulting URL.
 *
 * @param {string} line - Non-empty, already trimmed line from the proxy list.
 * @param {number} lineNumber - 1-based line number, used only in the error message.
 * @returns {string} Proxy URL of the form `http://username:password@ip:port`.
 * @throws {Error} If any of the four fields is missing.
 */
function toProxyUrl(line, lineNumber) {
  const [ip, port, username, ...passwordParts] = line.split(":");
  const password = passwordParts.join(":");
  if (!ip || !port || !username || !password) {
    // Fail loudly instead of producing "http://undefined:undefined@..." URLs.
    throw new Error(
      `proxies.txt line ${lineNumber}: expected "ip:port:username:password"`
    );
  }
  const credentials = `${encodeURIComponent(username)}:${encodeURIComponent(password)}`;
  return `http://${credentials}@${ip}:${port}`;
}

// Proxy URLs read from proxies.txt (one `ip:port:username:password` entry per line;
// blank lines are ignored).
const proxyUrls = readFileSync("proxies.txt", "utf-8")
  .split(/\r?\n/)
  .map((line, index) => ({ line: line.trim(), lineNumber: index + 1 }))
  .filter(({ line }) => line.length > 0)
  .map(({ line, lineNumber }) => toProxyUrl(line, lineNumber));

// Log only the count: the URLs embed the proxy credentials.
console.log(`Loaded ${proxyUrls.length} proxy URLs from proxies.txt`);
|
||||
// const scrapoxyConfig = {
|
||||
// username: "asdf",
|
||||
// password: "asdf",
|
||||
// proxyUrl: "partido-obrero:8888",
|
||||
// apiUrl: "partido-obrero:8890",
|
||||
// };
|
||||
|
||||
// Proxy configuration handed to the crawler, built from the `proxyUrls` list.
// A disabled earlier variant filled the list with 100 copies of a single endpoint
// (either the Scrapoxy proxy described by `scrapoxyConfig` or a rotating
// p.webshare.io endpoint) instead of reading individual proxies from a file.
const proxyConf = new ProxyConfiguration({ proxyUrls });
|
||||
|
||||
// Request router for the Cheerio crawler; handlers are registered on it per label.
const router = createCheerioRouter();

/**
 * DETAIL handler: a single product page.
 * Reads the trimmed text of `h1.product_page` as the product name and stores it,
 * together with the URL that was actually loaded, in the "products" dataset.
 */
const handleDetail = async ({ request, parseWithCheerio, pushData }) => {
  const $ = await parseWithCheerio();
  const heading = $("h1.product_page");
  const name = heading.text().trim();
  const record = { name, url: request.loadedUrl };
  await pushData(record, "products");
};

router.addHandler("DETAIL", handleDetail);
|
||||
/**
 * CATEGORY handler: a product listing page.
 *
 * 1. Enqueues every product link on the page as a DETAIL request.
 * 2. Stores the title and link of each `ul#products > li` entry in the
 *    "product-list" dataset.
 * 3. Enqueues the "Siguiente" (next page) link as another CATEGORY request.
 */
router.addHandler(
  "CATEGORY",
  async ({ enqueueLinks, pushData, parseWithCheerio }) => {
    const productLinkSelector = 'a[href^="/sitios/cdigi/producto"]';
    const $ = await parseWithCheerio();

    await enqueueLinks({
      selector: productLinkSelector,
      label: "DETAIL",
    });

    const listedProducts = Array.from($("ul#products").children("li"), (el) => {
      const item = $(el);
      return {
        title: item.find(".atg_store_productTitle .descrip_full").text().trim(),
        // Raw href attribute as written in the page (site-relative path);
        // undefined when the entry has no product link.
        url: item.find(productLinkSelector).attr("href"),
      };
    });

    // One batched write instead of awaiting a separate write per product.
    // Skipped for empty listings so no write is issued, as before.
    if (listedProducts.length > 0) {
      await pushData(listedProducts, "product-list");
    }

    // Follow pagination: the next listing page is handled by this same handler.
    await enqueueLinks({
      selector: "[title=Siguiente]",
      label: "CATEGORY",
    });
  }
);
|
||||
/**
 * Default handler (used for requests that carry no label, such as the start URL):
 * seeds the crawl by enqueueing the catalogue browse page as a CATEGORY request.
 */
router.addDefaultHandler(async ({ enqueueLinks }) => {
  const browseUrl = "https://www.cotodigital3.com.ar/sitios/cdigi/browse";
  await enqueueLinks({ label: "CATEGORY", urls: [browseUrl] });
});
|
||||
|
||||
// Status codes that mark a session as blocked.
const sessionPoolOptions = {
  blockedStatusCodes: [401, 403, 429, 500],
};

// CheerioCrawler (not a Playwright/browser crawler): pages are handled by the
// Cheerio router above and every request goes through the configured proxies.
const crawler = new CheerioCrawler({
  requestHandler: router,
  requestHandlerTimeoutSecs: 30,
  maxRequestRetries: 50,
  minConcurrency: 10,
  maxConcurrency: 50,
  proxyConfiguration: proxyConf,
  ignoreSslErrors: true,
  useSessionPool: true,
  sessionPoolOptions,
  // NOTE: a Scrapoxy-specific `errorHandler` used to be configured here and is
  // currently disabled. On status 557 ("no proxy available") it waited 30 seconds;
  // otherwise it read the `x-scrapoxy-proxyname` response header and POSTed to the
  // Scrapoxy API (`/api/scraper/project/proxies/remove`) to drop that proxy.
});

// Start at the site root; the default handler then enqueues the browse page.
await crawler.run(["https://www.cotodigital3.com.ar/"]);
|
|
@ -1,12 +0,0 @@
|
|||
{
|
||||
"extends": "@apify/tsconfig",
|
||||
"compilerOptions": {
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"target": "ES2022",
|
||||
"outDir": "dist",
|
||||
"noUnusedLocals": false,
|
||||
"lib": ["DOM"]
|
||||
},
|
||||
"include": ["./src/**/*"]
|
||||
}
|
Loading…
Reference in a new issue