scraper: reorganize code
- delete old code
- centralize the link scrapers
This commit is contained in: parent a322bc36fc, commit fa6de68f60.
15 changed files with 17 additions and 66 deletions.
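Judging from the renamed package and the import-path changes in the hunks below, the three per-supermarket link-scraper packages are merged into a single link-scrapers workspace; the resulting layout is presumably:

link-scrapers/
  package.json
  carrefour.ts
  coto.ts
  dia.ts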
bun.lockb (BIN)
Binary file not shown.
carrefour-link-scraper/package.json
@@ -1,17 +0,0 @@
-{
-  "name": "carrefour-link-scraper",
-  "type": "module",
-  "version": "1.0.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "keywords": [],
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "linkedom": "^0.16.5",
-    "p-map": "^7.0.1"
-  }
-}
dia-link-scraper/package.json
@@ -1,17 +0,0 @@
-{
-  "name": "dia-link-scraper",
-  "type": "module",
-  "version": "1.0.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "keywords": [],
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "linkedom": "^0.16.5",
-    "p-map": "^7.0.0"
-  }
-}
@ -1,4 +1,3 @@
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
@ -28,12 +27,13 @@ function getPage(url: string) {
|
||||||
return async () => {
|
return async () => {
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
html = await getHtml(url);
|
const res = await fetch(url);
|
||||||
|
html = await res.text();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
await getPage(url)();
|
await getPage(url)();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { document } = parseHTML(html.toString("utf-8"));
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
|
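A note on this hunk: the deleted getHtml helper (its removal appears further down, in scraper/fetch.ts) returned a Buffer, hence the old toString("utf-8") call; Response.text() already yields a decoded string. A minimal standalone sketch of the new fetch-and-retry shape (fetchPageText is a hypothetical name; assumes a runtime with global fetch, such as Bun or Node 18+):

// Sketch of the pattern in getPage: fetch a page as text,
// retrying on network errors.
async function fetchPageText(url: string): Promise<string> {
  try {
    const res = await fetch(url);
    return await res.text(); // already a string; no Buffer#toString needed
  } catch (error) {
    return fetchPageText(url); // unbounded retry, mirroring getPage's recursion
  }
}

Like the original, this retries forever on persistent failures; a capped retry count would be the defensive variant.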
link-scrapers/dia.ts
@@ -1,7 +1,6 @@
 import pMap from "p-map";
 import { decodeXML } from "entities";
 import { parseHTML } from "linkedom";
-import { getHtml } from "../scraper/fetch.js";
 import { saveUrls } from "db-datos/urlHelpers.js";
 
 const categorias = [
@@ -111,8 +110,9 @@ async function scrapBySite() {
   await pMap(
     links,
     async (url) => {
-      const html = await getHtml(url);
-      const { document } = parseHTML(html.toString("utf-8"));
+      const res = await fetch(url);
+      const html = await res.text();
+      const { document } = parseHTML(html);
 
       const hrefs = Array.from(
         document.querySelectorAll<HTMLAnchorElement>(
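For context on the crawl loop this hunk edits: p-map's pMap(iterable, mapper, options) runs the async mapper over the list with bounded concurrency. A minimal sketch (the URLs and concurrency value are hypothetical, not taken from the scraper):

import pMap from "p-map";

const links = ["https://example.com/p/1", "https://example.com/p/2"]; // hypothetical URLs

// Crawl the list with at most 8 requests in flight at once.
await pMap(
  links,
  async (url) => {
    const res = await fetch(url);
    const html = await res.text();
    // ...parse `html` with linkedom and persist the product links...
  },
  { concurrency: 8 }, // illustrative value
);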
link-scrapers/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "coto-link-scraper",
+  "name": "link-scrapers",
   "type": "module",
   "version": "1.0.0",
   "description": "",
package.json
@@ -2,9 +2,7 @@
   "name": "preciazo",
   "private": true,
   "workspaces": [
-    "dia-link-scraper",
-    "coto-link-scraper",
-    "carrefour-link-scraper",
+    "link-scrapers",
     "scraper",
     "sitio",
     "db-datos"
readme.md
@@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
 
 ## componentes (en orden de proceso)
 
-- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
+- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
 
   (no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
 
scraper/auto.ts
@@ -8,9 +8,9 @@ import { downloadList } from "./scrap.js";
 import { db } from "db-datos/db.js";
 import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
-import { scrapDiaProducts } from "../dia-link-scraper/index.js";
-import { scrapCotoProducts } from "../coto-link-scraper/index.js";
-import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
+import { scrapDiaProducts } from "../link-scrapers/dia.js";
+import { scrapCotoProducts } from "../link-scrapers/coto.js";
+import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
scraper/cli.ts
@@ -1,6 +1,6 @@
-import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
-import { scrapCotoProducts } from "../coto-link-scraper/index.js";
-import { scrapDiaProducts } from "../dia-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
+import { scrapCotoProducts } from "../link-scrapers/coto.js";
+import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { auto } from "./auto.js";
 import { downloadList, getProduct } from "./scrap.js";
 
scraper/fetch.ts
@@ -1,13 +0,0 @@
-export async function getHtml(url: string) {
-  const res = await fetch(url);
-  return readableToBuffer(res.body!);
-}
-
-async function readableToBuffer(source: AsyncIterable<any>) {
-  // https://stackoverflow.com/a/72891118
-  const buffers = [];
-  for await (const data of source) {
-    buffers.push(data);
-  }
-  return Buffer.concat(buffers);
-}
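The deleted helper collected the response body stream into a Buffer by hand. On a runtime that ships the WHATWG fetch Response (Bun, Node 18+), the built-in body reader does the same job, which is what the call sites above switched to. A sketch of the replacement (getHtmlText is a hypothetical name; note the return type changes from Buffer to string, which is why the callers dropped their toString("utf-8") calls):

// Equivalent of the deleted getHtml, without manual stream buffering.
// Response.text() decodes the body; arrayBuffer() would keep it binary.
async function getHtmlText(url: string): Promise<string> {
  const res = await fetch(url);
  return await res.text();
}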
scraper/parsers/carrefour.ts
@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
 import { Precioish } from "../scrap.js";
-import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
+import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
 
 function parseScriptJson<T>(dom: Window, varname: string): T {
   const script = dom.window.document.querySelector<HTMLTemplateElement>(
scraper/parsers/dia.ts
@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
 import { type Precioish } from "../scrap.js";
-import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
+import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
 
 export function getDiaProduct(html: string | Buffer): Precioish {
   const dom = parseHTML(html);