diff --git a/coto-crawlee/.dockerignore b/coto-crawlee/.dockerignore
deleted file mode 100644
index 2710668..0000000
--- a/coto-crawlee/.dockerignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# configurations
-.idea
-
-# crawlee storage folder
-storage
-
-# installed files
-node_modules
diff --git a/coto-crawlee/.gitignore b/coto-crawlee/.gitignore
deleted file mode 100644
index e0996e6..0000000
--- a/coto-crawlee/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-# This file tells Git which files shouldn't be added to source control
-
-.idea
-dist
-node_modules
-storage
diff --git a/coto-crawlee/Dockerfile b/coto-crawlee/Dockerfile
deleted file mode 100644
index 1fe6784..0000000
--- a/coto-crawlee/Dockerfile
+++ /dev/null
@@ -1,51 +0,0 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node-playwright-chrome:20 AS builder
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
-
-# Install all dependencies. Don't audit to speed up the installation.
-RUN npm install --include=dev --audit=false
-
-# Next, copy the source files using the user set
-# in the base image.
-COPY --chown=myuser . ./
-
-# Install all dependencies and build the project.
-# Don't audit to speed up the installation.
-RUN npm run build
-
-# Create final image
-FROM apify/actor-node-playwright-chrome:20
-
-# Copy only built JS files from builder image
-COPY --from=builder --chown=myuser /home/myuser/dist ./dist
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && npm --version
-
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY --chown=myuser . ./
-
-
-# Run the image. If you know you won't need headful browsers,
-# you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
diff --git a/coto-crawlee/README.md b/coto-crawlee/README.md
deleted file mode 100644
index 6a404f0..0000000
--- a/coto-crawlee/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Getting started with Crawlee
-
-This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
-
-You can find more examples and documentation at the following links:
-
-- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
-- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
-- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
diff --git a/coto-crawlee/bun.lockb b/coto-crawlee/bun.lockb
deleted file mode 100755
index ae88d9d..0000000
Binary files a/coto-crawlee/bun.lockb and /dev/null differ
diff --git a/coto-crawlee/package.json b/coto-crawlee/package.json
deleted file mode 100644
index b07d258..0000000
--- a/coto-crawlee/package.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "name": "coto-crawlee",
-  "version": "0.0.1",
-  "type": "module",
-  "description": "This is an example of a Crawlee project.",
-  "dependencies": {
-    "crawlee": "^3.0.0",
-    "playwright": "*"
-  },
-  "devDependencies": {
-    "@apify/tsconfig": "^0.1.0",
-    "tsx": "^4.4.0",
-    "typescript": "~5.5.0",
-    "@types/node": "^20.0.0"
-  },
-  "scripts": {
-    "start": "npm run start:dev",
-    "start:prod": "node dist/main.js",
-    "start:dev": "tsx src/main.ts",
-    "build": "tsc",
-    "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
-  },
-  "author": "It's not you it's me",
-  "license": "ISC"
-}
diff --git a/coto-crawlee/src/main.ts b/coto-crawlee/src/main.ts
deleted file mode 100644
index 579132a..0000000
--- a/coto-crawlee/src/main.ts
+++ /dev/null
@@ -1,123 +0,0 @@
-// For more information, see https://crawlee.dev/
-import {
-  CheerioCrawler,
-  createCheerioRouter,
-  createPlaywrightRouter,
-  enqueueLinks,
-  PlaywrightCrawler,
-  ProxyConfiguration,
-} from "crawlee";
-import { readFileSync } from "fs";
-
-const proxyUrls = readFileSync("proxies.txt", "utf-8")
-  .split(/\r?\n/)
-  .filter((x) => x.trim().length > 0)
-  .map((x) => {
-    const [ip, port, username, password] = x.split(":");
-    return `http://${username}:${password}@${ip}:${port}`;
-  });
-console.log(proxyUrls);
-// const scrapoxyConfig = {
-//   username: "asdf",
-//   password: "asdf",
-//   proxyUrl: "partido-obrero:8888",
-//   apiUrl: "partido-obrero:8890",
-// };
-
-const proxyConf = new ProxyConfiguration({
-  proxyUrls: proxyUrls, // proxyUrls: Array(100).fill(
-  //   `http://${scrapoxyConfig.username}:${scrapoxyConfig.password}@${scrapoxyConfig.proxyUrl}/`
-  //   // "http://asdfasdf-rotate:asdfasdf@p.webshare.io"
-  // ),
-});
-
-const router = createCheerioRouter();
-router.addHandler("DETAIL", async ({ request, parseWithCheerio, pushData }) => {
-  const $ = await parseWithCheerio();
-
-  const name = $("h1.product_page").text().trim();
-  await pushData({ name, url: request.loadedUrl }, "products");
-});
-router.addHandler(
-  "CATEGORY",
-  async ({ request, enqueueLinks, log, pushData, parseWithCheerio }) => {
-    // const title = await page.title();
-    // log.info(`Title of ${request.loadedUrl} is '${title}'`);
-    const $ = await parseWithCheerio();
-
-    await enqueueLinks({
-      selector: 'a[href^="/sitios/cdigi/producto"]',
-      label: "DETAIL",
-    });
-
-    const productsEls = $("ul#products").children("li");
-
-    for (const el of productsEls) {
-      const title = $(el)
-        .find(".atg_store_productTitle .descrip_full")
-        .text()
-        .trim();
-      const href = $(el).find('a[href^="/sitios/cdigi/producto"]');
-      await pushData({ title, url: href.attr("href") }, "product-list");
-    }
-    // Save results as JSON to ./storage/datasets/default
-
-    // Extract links from the current page
-    // and add them to the crawling queue.
-    await enqueueLinks({
-      selector: "[title=Siguiente]",
-      label: "CATEGORY",
-    });
-  }
-);
-router.addDefaultHandler(async ({ enqueueLinks }) => {
-  await enqueueLinks({
-    urls: ["https://www.cotodigital3.com.ar/sitios/cdigi/browse"],
-    label: "CATEGORY",
-  });
-});
-
-// PlaywrightCrawler crawls the web using a headless
-// browser controlled by the Playwright library.
-const crawler = new CheerioCrawler({
-  proxyConfiguration: proxyConf,
-  ignoreSslErrors: true,
-
-  useSessionPool: true,
-  sessionPoolOptions: {
-    blockedStatusCodes: [401, 403, 429, 500],
-  },
-  minConcurrency: 10,
-  maxConcurrency: 50,
-  maxRequestRetries: 50,
-  requestHandlerTimeoutSecs: 30,
-  requestHandler: router,
-  // async errorHandler({ request, response, log }) {
-  //   if (!response || !("statusCode" in response)) {
-  //     log.error("Response has no statusCode", { response });
-  //     return;
-  //   }
-  //   if (response.statusCode === 557) {
-  //     log.warning("No proxy available, waiting");
-  //     await new Promise((resolve) => setTimeout(resolve, 30 * 1000));
-  //     return;
-  //   }
-  //   const proxyName = response.headers["x-scrapoxy-proxyname"];
-  //   log.warning(`Resetting proxy`, { proxyName, headers: response.headers });
-  //   const res = await fetch(
-  //     `http://${scrapoxyConfig.apiUrl}/api/scraper/project/proxies/remove`,
-  //     {
-  //       method: "POST",
-  //       body: JSON.stringify([{ id: proxyName, force: false }]),
-  //       headers: {
-  //         Authorization: `Basic ${btoa(`${scrapoxyConfig.username}:${scrapoxyConfig.password}`)}`,
-  //         "Content-Type": "application/json",
-  //       },
-  //     }
-  //   );
-  //   if (!res.ok)
-  //     log.error(`status code ${res.status}`, { json: await res.json() });
-  // },
-});
-
-await crawler.run(["https://www.cotodigital3.com.ar/"]);
diff --git a/coto-crawlee/tsconfig.json b/coto-crawlee/tsconfig.json
deleted file mode 100644
index cc141ac..0000000
--- a/coto-crawlee/tsconfig.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "extends": "@apify/tsconfig",
-  "compilerOptions": {
-    "module": "NodeNext",
-    "moduleResolution": "NodeNext",
-    "target": "ES2022",
-    "outDir": "dist",
-    "noUnusedLocals": false,
-    "lib": ["DOM"]
-  },
-  "include": ["./src/**/*"]
-}
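For anyone resurrecting this crawler later, the core pattern the deleted `src/main.ts` implemented — parsing `ip:port:username:password` entries from a `proxies.txt` into proxy URLs and rotating them through a `CheerioCrawler` session pool — boils down to the minimal sketch below. It assumes `crawlee` ^3 as pinned in the removed `package.json`; the inline request handler is illustrative only, standing in for the removed `createCheerioRouter()` with its `DETAIL`/`CATEGORY` handlers.

```ts
// Minimal sketch mirroring the deleted src/main.ts. Assumes crawlee ^3 and a
// proxies.txt file with one ip:port:username:password entry per line.
import { CheerioCrawler, ProxyConfiguration } from "crawlee";
import { readFileSync } from "node:fs";

// Turn each ip:port:username:password line into an authenticated proxy URL.
const proxyUrls = readFileSync("proxies.txt", "utf-8")
  .split(/\r?\n/)
  .filter((line) => line.trim().length > 0)
  .map((line) => {
    const [ip, port, username, password] = line.split(":");
    return `http://${username}:${password}@${ip}:${port}`;
  });

const crawler = new CheerioCrawler({
  proxyConfiguration: new ProxyConfiguration({ proxyUrls }),
  useSessionPool: true,
  // Retire sessions (and the proxies bound to them) once they get blocked.
  sessionPoolOptions: { blockedStatusCodes: [401, 403, 429] },
  // Illustrative handler; the deleted project routed requests through
  // a createCheerioRouter() instance instead.
  async requestHandler({ request, $, log }) {
    log.info(`${request.loadedUrl}: ${$("title").text().trim()}`);
  },
});

await crawler.run(["https://www.cotodigital3.com.ar/"]);
```

Pairing `useSessionPool` with `blockedStatusCodes` lets the pool drop a session, and with it the proxy it was bound to, as soon as the site starts answering with blocking status codes, which is presumably why the deleted crawler combined it with a high `maxRequestRetries` of 50.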