rm coto-crawlee

Cat /dev/Nulo 2024-11-13 09:49:44 -03:00
parent 9a82a556f9
commit 9cb7c0e27e
8 changed files with 0 additions and 234 deletions

.dockerignore

@@ -1,8 +0,0 @@
# configurations
.idea

# crawlee storage folder
storage

# installed files
node_modules

.gitignore

@@ -1,6 +0,0 @@
# This file tells Git which files shouldn't be added to source control

.idea
dist
node_modules
storage

Dockerfile

@@ -1,51 +0,0 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project into dist/ (dependencies were
# already installed in the step above).
RUN npm run build

# Create the final image.
FROM apify/actor-node-playwright-chrome:20

# Copy only the built JS files from the builder image.
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

README.md

@@ -1,9 +0,0 @@
# Getting started with Crawlee

This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).

You can find more examples and documentation at the following links:

- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
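
For reference, the recursive crawl this README describes needs only a handful of lines. A minimal sketch using the standard `PlaywrightCrawler` API (not taken from this repository's source):

```ts
import { PlaywrightCrawler } from "crawlee";

const crawler = new PlaywrightCrawler({
  // Runs for every crawled page; `page` is a Playwright Page.
  async requestHandler({ request, page, enqueueLinks, log }) {
    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);
    // Enqueue all same-hostname links found on this page.
    await enqueueLinks();
  },
});

await crawler.run(["https://crawlee.dev"]);
```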

Binary file not shown.

package.json

@@ -1,25 +0,0 @@
{
  "name": "coto-crawlee",
  "version": "0.0.1",
  "type": "module",
  "description": "This is an example of a Crawlee project.",
  "dependencies": {
    "crawlee": "^3.0.0",
    "playwright": "*"
  },
  "devDependencies": {
    "@apify/tsconfig": "^0.1.0",
    "tsx": "^4.4.0",
    "typescript": "~5.5.0",
    "@types/node": "^20.0.0"
  },
  "scripts": {
    "start": "npm run start:dev",
    "start:prod": "node dist/main.js",
    "start:dev": "tsx src/main.ts",
    "build": "tsc",
    "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
  },
  "author": "It's not you it's me",
  "license": "ISC"
}

src/main.ts

@@ -1,123 +0,0 @@
// For more information, see https://crawlee.dev/
import {
  CheerioCrawler,
  createCheerioRouter,
  ProxyConfiguration,
} from "crawlee";
import { readFileSync } from "fs";
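
// proxies.txt lists one proxy per line as "ip:port:username:password";
// this format is inferred from the parsing below.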
const proxyUrls = readFileSync("proxies.txt", "utf-8")
  .split(/\r?\n/)
  .filter((x) => x.trim().length > 0)
  .map((x) => {
    const [ip, port, username, password] = x.split(":");
    return `http://${username}:${password}@${ip}:${port}`;
  });
console.log(proxyUrls);

// const scrapoxyConfig = {
//   username: "asdf",
//   password: "asdf",
//   proxyUrl: "partido-obrero:8888",
//   apiUrl: "partido-obrero:8890",
// };
const proxyConf = new ProxyConfiguration({
  proxyUrls: proxyUrls,
  // proxyUrls: Array(100).fill(
  //   `http://${scrapoxyConfig.username}:${scrapoxyConfig.password}@${scrapoxyConfig.proxyUrl}/`
  //   // "http://asdfasdf-rotate:asdfasdf@p.webshare.io"
  // ),
});

const router = createCheerioRouter();

// Product detail pages: scrape the name and save it to the
// named dataset ./storage/datasets/products.
router.addHandler("DETAIL", async ({ request, parseWithCheerio, pushData }) => {
  const $ = await parseWithCheerio();
  const name = $("h1.product_page").text().trim();
  await pushData({ name, url: request.loadedUrl }, "products");
});

// Category pages: enqueue product detail pages, scrape the product
// list itself, and follow the pagination.
router.addHandler(
  "CATEGORY",
  async ({ enqueueLinks, pushData, parseWithCheerio }) => {
    const $ = await parseWithCheerio();
    await enqueueLinks({
      selector: 'a[href^="/sitios/cdigi/producto"]',
      label: "DETAIL",
    });
    const productsEls = $("ul#products").children("li");
    for (const el of productsEls) {
      const title = $(el)
        .find(".atg_store_productTitle .descrip_full")
        .text()
        .trim();
      const href = $(el).find('a[href^="/sitios/cdigi/producto"]');
      await pushData({ title, url: href.attr("href") }, "product-list");
    }
    // Follow the "Siguiente" ("Next") link to the next page
    // of the category.
    await enqueueLinks({
      selector: "[title=Siguiente]",
      label: "CATEGORY",
    });
  }
);

router.addDefaultHandler(async ({ enqueueLinks }) => {
  await enqueueLinks({
    urls: ["https://www.cotodigital3.com.ar/sitios/cdigi/browse"],
    label: "CATEGORY",
  });
});

// CheerioCrawler downloads pages over plain HTTP and parses the HTML
// with Cheerio, which is much lighter than a headless browser.
const crawler = new CheerioCrawler({
  proxyConfiguration: proxyConf,
  ignoreSslErrors: true,
  // Retire a session (and its proxy) when one of these status codes
  // comes back instead of treating it as a valid response.
  useSessionPool: true,
  sessionPoolOptions: {
    blockedStatusCodes: [401, 403, 429, 500],
  },
  minConcurrency: 10,
  maxConcurrency: 50,
  maxRequestRetries: 50,
  requestHandlerTimeoutSecs: 30,
  requestHandler: router,
  // async errorHandler({ request, response, log }) {
  //   if (!response || !("statusCode" in response)) {
  //     log.error("Response has no statusCode", { response });
  //     return;
  //   }
  //   if (response.statusCode === 557) {
  //     log.warning("No proxy available, waiting");
  //     await new Promise((resolve) => setTimeout(resolve, 30 * 1000));
  //     return;
  //   }
  //   const proxyName = response.headers["x-scrapoxy-proxyname"];
  //   log.warning(`Resetting proxy`, { proxyName, headers: response.headers });
  //   const res = await fetch(
  //     `http://${scrapoxyConfig.apiUrl}/api/scraper/project/proxies/remove`,
  //     {
  //       method: "POST",
  //       body: JSON.stringify([{ id: proxyName, force: false }]),
  //       headers: {
  //         Authorization: `Basic ${btoa(`${scrapoxyConfig.username}:${scrapoxyConfig.password}`)}`,
  //         "Content-Type": "application/json",
  //       },
  //     }
  //   );
  //   if (!res.ok)
  //     log.error(`status code ${res.status}`, { json: await res.json() });
  // },
});

await crawler.run(["https://www.cotodigital3.com.ar/"]);
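
The two `pushData` calls above write into named datasets, which Crawlee persists under `storage/datasets/products` and `storage/datasets/product-list`. A minimal sketch of reading the results back after a run, using Crawlee's `Dataset` API (illustrative only; this snippet was not part of the deleted project):

```ts
import { Dataset } from "crawlee";

// Open the named dataset the DETAIL handler wrote to
// and report how many products were collected.
const products = await Dataset.open("products");
const { items } = await products.getData();
console.log(`scraped ${items.length} products`);
```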

tsconfig.json

@@ -1,12 +0,0 @@
{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "target": "ES2022",
    "outDir": "dist",
    "noUnusedLocals": false,
    "lib": ["DOM"]
  },
  "include": ["./src/**/*"]
}