rm coto-crawlee

Cat /dev/Nulo 2024-11-13 09:49:44 -03:00
parent 9a82a556f9
commit 9cb7c0e27e
8 changed files with 0 additions and 234 deletions

.dockerignore

@@ -1,8 +0,0 @@
# configurations
.idea

# crawlee storage folder
storage

# installed files
node_modules

.gitignore

@@ -1,6 +0,0 @@
# This file tells Git which files shouldn't be added to source control

.idea
dist
node_modules
storage

Dockerfile

@@ -1,51 +0,0 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project into dist/ (dependencies were
# already installed in the step above).
RUN npm run build

# Create the final image.
FROM apify/actor-node-playwright-chrome:20

# Copy only the built JS files from the builder image.
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

README.md

@@ -1,9 +0,0 @@
# Getting started with Crawlee

This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).

You can find more examples and documentation at the following links:

- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
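
For reference, the recursive crawl this README describes needs only a handful of lines. A minimal sketch using the standard `PlaywrightCrawler` API (not taken from this repository's source):

```ts
import { PlaywrightCrawler } from "crawlee";

const crawler = new PlaywrightCrawler({
  // Runs for every crawled page; `page` is a Playwright Page.
  async requestHandler({ request, page, enqueueLinks, log }) {
    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);
    // Enqueue all same-hostname links found on this page.
    await enqueueLinks();
  },
});

await crawler.run(["https://crawlee.dev"]);
```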

Binary file not shown.

package.json

@@ -1,25 +0,0 @@
{
  "name": "coto-crawlee",
  "version": "0.0.1",
  "type": "module",
  "description": "This is an example of a Crawlee project.",
  "dependencies": {
    "crawlee": "^3.0.0",
    "playwright": "*"
  },
  "devDependencies": {
    "@apify/tsconfig": "^0.1.0",
    "tsx": "^4.4.0",
    "typescript": "~5.5.0",
    "@types/node": "^20.0.0"
  },
  "scripts": {
    "start": "npm run start:dev",
    "start:prod": "node dist/main.js",
    "start:dev": "tsx src/main.ts",
    "build": "tsc",
    "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
  },
  "author": "It's not you it's me",
  "license": "ISC"
}

src/main.ts

@@ -1,123 +0,0 @@
// For more information, see https://crawlee.dev/
import {
  CheerioCrawler,
  createCheerioRouter,
  ProxyConfiguration,
} from "crawlee";
import { readFileSync } from "fs";
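
// proxies.txt lists one proxy per line as "ip:port:username:password";
// this format is inferred from the parsing below.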
const proxyUrls = readFileSync("proxies.txt", "utf-8")
  .split(/\r?\n/)
  .filter((x) => x.trim().length > 0)
  .map((x) => {
    const [ip, port, username, password] = x.split(":");
    return `http://${username}:${password}@${ip}:${port}`;
  });
console.log(proxyUrls);

// const scrapoxyConfig = {
//   username: "asdf",
//   password: "asdf",
//   proxyUrl: "partido-obrero:8888",
//   apiUrl: "partido-obrero:8890",
// };
const proxyConf = new ProxyConfiguration({
  proxyUrls: proxyUrls,
  // proxyUrls: Array(100).fill(
  //   `http://${scrapoxyConfig.username}:${scrapoxyConfig.password}@${scrapoxyConfig.proxyUrl}/`
  //   // "http://asdfasdf-rotate:asdfasdf@p.webshare.io"
  // ),
});

const router = createCheerioRouter();

// Product detail pages: scrape the name and save it to the
// named dataset ./storage/datasets/products.
router.addHandler("DETAIL", async ({ request, parseWithCheerio, pushData }) => {
  const $ = await parseWithCheerio();
  const name = $("h1.product_page").text().trim();
  await pushData({ name, url: request.loadedUrl }, "products");
});

// Category pages: enqueue product detail pages, scrape the product
// list itself, and follow the pagination.
router.addHandler(
  "CATEGORY",
  async ({ enqueueLinks, pushData, parseWithCheerio }) => {
    const $ = await parseWithCheerio();
    await enqueueLinks({
      selector: 'a[href^="/sitios/cdigi/producto"]',
      label: "DETAIL",
    });
    const productsEls = $("ul#products").children("li");
    for (const el of productsEls) {
      const title = $(el)
        .find(".atg_store_productTitle .descrip_full")
        .text()
        .trim();
      const href = $(el).find('a[href^="/sitios/cdigi/producto"]');
      await pushData({ title, url: href.attr("href") }, "product-list");
    }
    // Follow the "Siguiente" ("Next") link to the next page
    // of the category.
    await enqueueLinks({
      selector: "[title=Siguiente]",
      label: "CATEGORY",
    });
  }
);

router.addDefaultHandler(async ({ enqueueLinks }) => {
  await enqueueLinks({
    urls: ["https://www.cotodigital3.com.ar/sitios/cdigi/browse"],
    label: "CATEGORY",
  });
});

// CheerioCrawler downloads pages over plain HTTP and parses the HTML
// with Cheerio, which is much lighter than a headless browser.
const crawler = new CheerioCrawler({
  proxyConfiguration: proxyConf,
  ignoreSslErrors: true,
  // Retire a session (and its proxy) when one of these status codes
  // comes back instead of treating it as a valid response.
  useSessionPool: true,
  sessionPoolOptions: {
    blockedStatusCodes: [401, 403, 429, 500],
  },
  minConcurrency: 10,
  maxConcurrency: 50,
  maxRequestRetries: 50,
  requestHandlerTimeoutSecs: 30,
  requestHandler: router,
  // async errorHandler({ request, response, log }) {
  //   if (!response || !("statusCode" in response)) {
  //     log.error("Response has no statusCode", { response });
  //     return;
  //   }
  //   if (response.statusCode === 557) {
  //     log.warning("No proxy available, waiting");
  //     await new Promise((resolve) => setTimeout(resolve, 30 * 1000));
  //     return;
  //   }
  //   const proxyName = response.headers["x-scrapoxy-proxyname"];
  //   log.warning(`Resetting proxy`, { proxyName, headers: response.headers });
  //   const res = await fetch(
  //     `http://${scrapoxyConfig.apiUrl}/api/scraper/project/proxies/remove`,
  //     {
  //       method: "POST",
  //       body: JSON.stringify([{ id: proxyName, force: false }]),
  //       headers: {
  //         Authorization: `Basic ${btoa(`${scrapoxyConfig.username}:${scrapoxyConfig.password}`)}`,
  //         "Content-Type": "application/json",
  //       },
  //     }
  //   );
  //   if (!res.ok)
  //     log.error(`status code ${res.status}`, { json: await res.json() });
  // },
});

await crawler.run(["https://www.cotodigital3.com.ar/"]);
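
The two `pushData` calls above write into named datasets, which Crawlee persists under `storage/datasets/products` and `storage/datasets/product-list`. A minimal sketch of reading the results back after a run, using Crawlee's `Dataset` API (illustrative only; this snippet was not part of the deleted project):

```ts
import { Dataset } from "crawlee";

// Open the named dataset the DETAIL handler wrote to
// and report how many products were collected.
const products = await Dataset.open("products");
const { items } = await products.getData();
console.log(`scraped ${items.length} products`);
```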

tsconfig.json

@@ -1,12 +0,0 @@
{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "target": "ES2022",
    "outDir": "dist",
    "noUnusedLocals": false,
    "lib": ["DOM"]
  },
  "include": ["./src/**/*"]
}