scraper: reorder code

- delete old code
- centralize the link scrapers
Cat /dev/Nulo 2024-01-04 18:10:02 -03:00
parent a322bc36fc
commit fa6de68f60
15 changed files with 17 additions and 66 deletions
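
The practical effect of the centralization: three per-store workspace packages collapse into a single link-scrapers package, and consumers import one module per store. A minimal sketch of the new import surface, taken from the paths changed in the diffs below:

// old: one package per store
// import { scrapCotoProducts } from "../coto-link-scraper/index.js";

// new: one package, one module per store
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapDiaProducts } from "../link-scrapers/dia.js";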

bun.lockb

Binary file not shown.

carrefour-link-scraper/package.json (deleted)

@@ -1,17 +0,0 @@
-{
-  "name": "carrefour-link-scraper",
-  "type": "module",
-  "version": "1.0.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "keywords": [],
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "linkedom": "^0.16.5",
-    "p-map": "^7.0.1"
-  }
-}

dia-link-scraper/package.json (deleted)

@@ -1,17 +0,0 @@
-{
-  "name": "dia-link-scraper",
-  "type": "module",
-  "version": "1.0.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "keywords": [],
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "linkedom": "^0.16.5",
-    "p-map": "^7.0.0"
-  }
-}

coto-link-scraper/index.ts → link-scrapers/coto.ts

@@ -1,4 +1,3 @@
-import { getHtml } from "../scraper/fetch.js";
 import { parseHTML } from "linkedom";
 import PQueue from "p-queue";
 import { saveUrls } from "db-datos/urlHelpers.js";
@@ -28,12 +27,13 @@ function getPage(url: string) {
   return async () => {
     let html;
     try {
-      html = await getHtml(url);
+      const res = await fetch(url);
+      html = await res.text();
     } catch (error) {
       await getPage(url)();
       return;
     }
-    const { document } = parseHTML(html.toString("utf-8"));
+    const { document } = parseHTML(html);
     const hrefs = Array.from(
       document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),

dia-link-scraper/index.ts → link-scrapers/dia.ts

@@ -1,7 +1,6 @@
 import pMap from "p-map";
 import { decodeXML } from "entities";
 import { parseHTML } from "linkedom";
-import { getHtml } from "../scraper/fetch.js";
 import { saveUrls } from "db-datos/urlHelpers.js";
 
 const categorias = [
@@ -111,8 +110,9 @@ async function scrapBySite() {
   await pMap(
     links,
     async (url) => {
-      const html = await getHtml(url);
-      const { document } = parseHTML(html.toString("utf-8"));
+      const res = await fetch(url);
+      const html = await res.text();
+      const { document } = parseHTML(html);
       const hrefs = Array.from(
         document.querySelectorAll<HTMLAnchorElement>(

coto-link-scraper/package.json → link-scrapers/package.json

@@ -1,5 +1,5 @@
 {
-  "name": "coto-link-scraper",
+  "name": "link-scrapers",
   "type": "module",
   "version": "1.0.0",
   "description": "",

package.json

@@ -2,9 +2,7 @@
   "name": "preciazo",
   "private": true,
   "workspaces": [
-    "dia-link-scraper",
-    "coto-link-scraper",
-    "carrefour-link-scraper",
+    "link-scrapers",
     "scraper",
     "sitio",
     "db-datos"

README.md

@@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
 
 ## componentes (en orden de proceso)
 
-- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
+- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
   (no hace falta correrlos porque ya hay listas armadas en [data/](./data/))


@@ -8,9 +8,9 @@ import { downloadList } from "./scrap.js";
 import { db } from "db-datos/db.js";
 import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
-import { scrapDiaProducts } from "../dia-link-scraper/index.js";
-import { scrapCotoProducts } from "../coto-link-scraper/index.js";
-import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
+import { scrapDiaProducts } from "../link-scrapers/dia.js";
+import { scrapCotoProducts } from "../link-scrapers/coto.js";
+import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,


@@ -1,6 +1,6 @@
-import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
-import { scrapCotoProducts } from "../coto-link-scraper/index.js";
-import { scrapDiaProducts } from "../dia-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
+import { scrapCotoProducts } from "../link-scrapers/coto.js";
+import { scrapDiaProducts } from "../link-scrapers/dia.js";
 import { auto } from "./auto.js";
 import { downloadList, getProduct } from "./scrap.js";

scraper/fetch.ts (deleted)

@ -1,13 +0,0 @@
export async function getHtml(url: string) {
const res = await fetch(url);
return readableToBuffer(res.body!);
}
async function readableToBuffer(source: AsyncIterable<any>) {
// https://stackoverflow.com/a/72891118
const buffers = [];
for await (const data of source) {
buffers.push(data);
}
return Buffer.concat(buffers);
}
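
Note: the call sites replace this helper with Response.text(), which consumes the body stream and decodes it as UTF-8 in one step. A minimal sketch of the equivalence (url is a placeholder):

// before: buffer the stream manually, then decode
// const html = (await getHtml(url)).toString("utf-8");

// after: let fetch do the buffering and the UTF-8 decoding
const res = await fetch(url);
const html: string = await res.text();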


@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
 import { Precioish } from "../scrap.js";
-import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
+import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
 
 function parseScriptJson<T>(dom: Window, varname: string): T {
   const script = dom.window.document.querySelector<HTMLTemplateElement>(


@@ -1,6 +1,6 @@
 import { parseHTML } from "linkedom";
 import { type Precioish } from "../scrap.js";
-import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
+import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
 
 export function getDiaProduct(html: string | Buffer): Precioish {
   const dom = parseHTML(html);