diff --git a/coto-crawlee/.dockerignore b/coto-crawlee/.dockerignore
new file mode 100644
index 0000000..2710668
--- /dev/null
+++ b/coto-crawlee/.dockerignore
@@ -0,0 +1,8 @@
+# configurations
+.idea
+
+# crawlee storage folder
+storage
+
+# installed files
+node_modules
diff --git a/coto-crawlee/.gitignore b/coto-crawlee/.gitignore
new file mode 100644
index 0000000..e0996e6
--- /dev/null
+++ b/coto-crawlee/.gitignore
@@ -0,0 +1,6 @@
+# This file tells Git which files shouldn't be added to source control
+
+.idea
+dist
+node_modules
+storage
diff --git a/coto-crawlee/Dockerfile b/coto-crawlee/Dockerfile
new file mode 100644
index 0000000..1fe6784
--- /dev/null
+++ b/coto-crawlee/Dockerfile
@@ -0,0 +1,51 @@
+# Specify the base Docker image. You can read more about
+# the available images at https://crawlee.dev/docs/guides/docker-images
+# You can also use any other image from Docker Hub.
+FROM apify/actor-node-playwright-chrome:20 AS builder
+
+# Copy just package.json and package-lock.json
+# to speed up the build using Docker layer cache.
+COPY --chown=myuser package*.json ./
+
+# Install all dependencies. Don't audit to speed up the installation.
+RUN npm install --include=dev --audit=false
+
+# Next, copy the source files using the user set
+# in the base image.
+COPY --chown=myuser . ./
+
+# Build the project (dependencies were already installed
+# in the previous step).
+RUN npm run build
+
+# Create the final image.
+FROM apify/actor-node-playwright-chrome:20
+
+# Copy only the built JS files from the builder image.
+COPY --from=builder --chown=myuser /home/myuser/dist ./dist
+
+# Copy just package.json and package-lock.json
+# to speed up the build using Docker layer cache.
+COPY --chown=myuser package*.json ./
+
+# Install NPM packages, skipping optional and development dependencies to
+# keep the image small. Avoid logging too much and print the dependency
+# tree for debugging.
+RUN npm --quiet set progress=false \
+    && npm install --omit=dev --omit=optional \
+    && echo "Installed NPM packages:" \
+    && (npm list --omit=dev --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after NPM install, rebuilds will be really fast
+# for most source file changes.
+COPY --chown=myuser . ./
+
+
+# Run the image. If you know you won't need headful browsers,
+# you can remove the XVFB start script for a micro perf gain.
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
diff --git a/coto-crawlee/README.md b/coto-crawlee/README.md
new file mode 100644
index 0000000..6a404f0
--- /dev/null
+++ b/coto-crawlee/README.md
@@ -0,0 +1,9 @@
+# Getting started with Crawlee
+
+This project uses `CheerioCrawler` to recursively crawl the Coto Digital store at https://www.cotodigital3.com.ar, parsing each page's HTML with [Cheerio](https://cheerio.js.org) via [Crawlee](https://crawlee.dev).
+
+You can find more examples and documentation at the following links:
+
+- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
+- `CheerioCrawler` [API documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler)
+- Other [examples](https://crawlee.dev/docs/examples)
diff --git a/coto-crawlee/bun.lockb b/coto-crawlee/bun.lockb
new file mode 100755
index 0000000..ae88d9d
Binary files /dev/null and b/coto-crawlee/bun.lockb differ
diff --git a/coto-crawlee/package.json b/coto-crawlee/package.json
new file mode 100644
index 0000000..b07d258
--- /dev/null
+++ b/coto-crawlee/package.json
@@ -0,0 +1,25 @@
+{
+    "name": "coto-crawlee",
+    "version": "0.0.1",
+    "type": "module",
+    "description": "This is an example of a Crawlee project.",
+    "dependencies": {
+        "crawlee": "^3.0.0",
+        "playwright": "*"
+    },
+    "devDependencies": {
+        "@apify/tsconfig": "^0.1.0",
+        "tsx": "^4.4.0",
+        "typescript": "~5.5.0",
+        "@types/node": "^20.0.0"
+    },
+    "scripts": {
+        "start": "npm run start:dev",
+        "start:prod": "node dist/main.js",
+        "start:dev": "tsx src/main.ts",
+        "build": "tsc",
+        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
+    },
+    "author": "It's not you it's me",
+    "license": "ISC"
+}
diff --git a/coto-crawlee/src/main.ts b/coto-crawlee/src/main.ts
new file mode 100644
index 0000000..579132a
--- /dev/null
+++ b/coto-crawlee/src/main.ts
@@ -0,0 +1,123 @@
+// For more information, see https://crawlee.dev/
+import {
+  CheerioCrawler,
+  createCheerioRouter,
+  createPlaywrightRouter,
+  enqueueLinks,
+  PlaywrightCrawler,
+  ProxyConfiguration,
+} from "crawlee";
+import { readFileSync } from "fs";
+
+const proxyUrls = readFileSync("proxies.txt", "utf-8")
+  .split(/\r?\n/)
+  .filter((x) => x.trim().length > 0)
+  .map((x) => {
+    const [ip, port, username, password] = x.split(":");
+    return `http://${username}:${password}@${ip}:${port}`;
+  });
+console.log(proxyUrls);
+// const scrapoxyConfig = {
+//   username: "asdf",
+//   password: "asdf",
+//   proxyUrl: "partido-obrero:8888",
+//   apiUrl: "partido-obrero:8890",
+// };
+
+const proxyConf = new ProxyConfiguration({
+  proxyUrls: proxyUrls, // proxyUrls: Array(100).fill(
+  //   `http://${scrapoxyConfig.username}:${scrapoxyConfig.password}@${scrapoxyConfig.proxyUrl}/`
+  //   // "http://asdfasdf-rotate:asdfasdf@p.webshare.io"
+  // ),
+});
+
+const router = createCheerioRouter();
+router.addHandler("DETAIL", async ({ request, parseWithCheerio, pushData }) => {
+  const $ = await parseWithCheerio();
+
+  const name = $("h1.product_page").text().trim();
+  await pushData({ name, url: request.loadedUrl }, "products");
+});
+router.addHandler(
+  "CATEGORY",
+  async ({ request, enqueueLinks, log, pushData, parseWithCheerio }) => {
+    // const title = await page.title();
+    // log.info(`Title of ${request.loadedUrl} is '${title}'`);
+    const $ = await parseWithCheerio();
+
+    await enqueueLinks({
+      selector: 'a[href^="/sitios/cdigi/producto"]',
+      label: "DETAIL",
+    });
+
+    const productsEls = $("ul#products").children("li");
+
+    for (const el of productsEls) {
+      const title = $(el)
+        .find(".atg_store_productTitle .descrip_full")
+        .text()
+        .trim();
+      const href = $(el).find('a[href^="/sitios/cdigi/producto"]');
+      await pushData({ title, url: href.attr("href") }, "product-list");
+    }
+    // Results are saved as JSON to ./storage/datasets/ (datasets "products" and "product-list").
+
+    // Enqueue the "Siguiente" ("Next") pagination link so the
+    // crawler keeps walking through the category pages.
+    await enqueueLinks({
+      selector: "[title=Siguiente]",
+      label: "CATEGORY",
+    });
+  }
+);
+router.addDefaultHandler(async ({ enqueueLinks }) => {
+  await enqueueLinks({
+    urls: ["https://www.cotodigital3.com.ar/sitios/cdigi/browse"],
+    label: "CATEGORY",
+  });
+});
+
+// CheerioCrawler downloads each page with a plain HTTP request
+// and parses the HTML with Cheerio (no browser is involved).
+const crawler = new CheerioCrawler({
+  proxyConfiguration: proxyConf,
+  ignoreSslErrors: true,
+
+  useSessionPool: true,
+  sessionPoolOptions: {
+    blockedStatusCodes: [401, 403, 429, 500],
+  },
+  minConcurrency: 10,
+  maxConcurrency: 50,
+  maxRequestRetries: 50,
+  requestHandlerTimeoutSecs: 30,
+  requestHandler: router,
+  // async errorHandler({ request, response, log }) {
+  //   if (!response || !("statusCode" in response)) {
+  //     log.error("Response has no statusCode", { response });
+  //     return;
+  //   }
+  //   if (response.statusCode === 557) {
+  //     log.warning("No proxy available, waiting");
+  //     await new Promise((resolve) => setTimeout(resolve, 30 * 1000));
+  //     return;
+  //   }
+  //   const proxyName = response.headers["x-scrapoxy-proxyname"];
+  //   log.warning(`Resetting proxy`, { proxyName, headers: response.headers });
+  //   const res = await fetch(
+  //     `http://${scrapoxyConfig.apiUrl}/api/scraper/project/proxies/remove`,
+  //     {
+  //       method: "POST",
+  //       body: JSON.stringify([{ id: proxyName, force: false }]),
+  //       headers: {
+  //         Authorization: `Basic ${btoa(`${scrapoxyConfig.username}:${scrapoxyConfig.password}`)}`,
+  //         "Content-Type": "application/json",
+  //       },
+  //     }
+  //   );
+  //   if (!res.ok)
+  //     log.error(`status code ${res.status}`, { json: await res.json() });
+  // },
+});
+
+await crawler.run(["https://www.cotodigital3.com.ar/"]);
diff --git a/coto-crawlee/tsconfig.json b/coto-crawlee/tsconfig.json
new file mode 100644
index 0000000..cc141ac
--- /dev/null
+++ b/coto-crawlee/tsconfig.json
@@ -0,0 +1,12 @@
+{
+    "extends": "@apify/tsconfig",
+    "compilerOptions": {
+        "module": "NodeNext",
+        "moduleResolution": "NodeNext",
+        "target": "ES2022",
+        "outDir": "dist",
+        "noUnusedLocals": false,
+        "lib": ["DOM"]
+    },
+    "include": ["./src/**/*"]
+}