mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 06:16:18 +00:00
scraper: reordenar codigo
- borrar código viejo - centralizar scrapers de links
This commit is contained in:
parent
a322bc36fc
commit
fa6de68f60
15 changed files with 17 additions and 66 deletions
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,17 +0,0 @@
|
|||
{
|
||||
"name": "carrefour-link-scraper",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"linkedom": "^0.16.5",
|
||||
"p-map": "^7.0.1"
|
||||
}
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
{
|
||||
"name": "dia-link-scraper",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"linkedom": "^0.16.5",
|
||||
"p-map": "^7.0.0"
|
||||
}
|
||||
}
|
|
@ -1,4 +1,3 @@
|
|||
import { getHtml } from "../scraper/fetch.js";
|
||||
import { parseHTML } from "linkedom";
|
||||
import PQueue from "p-queue";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
@ -28,12 +27,13 @@ function getPage(url: string) {
|
|||
return async () => {
|
||||
let html;
|
||||
try {
|
||||
html = await getHtml(url);
|
||||
const res = await fetch(url);
|
||||
html = await res.text();
|
||||
} catch (error) {
|
||||
await getPage(url)();
|
||||
return;
|
||||
}
|
||||
const { document } = parseHTML(html.toString("utf-8"));
|
||||
const { document } = parseHTML(html);
|
||||
|
||||
const hrefs = Array.from(
|
||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
|
@ -1,7 +1,6 @@
|
|||
import pMap from "p-map";
|
||||
import { decodeXML } from "entities";
|
||||
import { parseHTML } from "linkedom";
|
||||
import { getHtml } from "../scraper/fetch.js";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
||||
const categorias = [
|
||||
|
@ -111,8 +110,9 @@ async function scrapBySite() {
|
|||
await pMap(
|
||||
links,
|
||||
async (url) => {
|
||||
const html = await getHtml(url);
|
||||
const { document } = parseHTML(html.toString("utf-8"));
|
||||
const res = await fetch(url);
|
||||
const html = await res.text();
|
||||
const { document } = parseHTML(html);
|
||||
|
||||
const hrefs = Array.from(
|
||||
document.querySelectorAll<HTMLAnchorElement>(
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"name": "coto-link-scraper",
|
||||
"name": "link-scrapers",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
|
@ -2,9 +2,7 @@
|
|||
"name": "preciazo",
|
||||
"private": true,
|
||||
"workspaces": [
|
||||
"dia-link-scraper",
|
||||
"coto-link-scraper",
|
||||
"carrefour-link-scraper",
|
||||
"link-scrapers",
|
||||
"scraper",
|
||||
"sitio",
|
||||
"db-datos"
|
||||
|
|
|
@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
|
|||
|
||||
## componentes (en orden de proceso)
|
||||
|
||||
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
|
||||
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
|
||||
|
||||
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
||||
|
||||
|
|
|
@ -8,9 +8,9 @@ import { downloadList } from "./scrap.js";
|
|||
import { db } from "db-datos/db.js";
|
||||
import { like } from "drizzle-orm";
|
||||
import { productoUrls } from "db-datos/schema.js";
|
||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||
|
||||
const supermercados: Supermercado[] = [
|
||||
Supermercado.Carrefour,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
||||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||
import { auto } from "./auto.js";
|
||||
import { downloadList, getProduct } from "./scrap.js";
|
||||
|
||||
|
|
|
@ -1,13 +0,0 @@
|
|||
export async function getHtml(url: string) {
|
||||
const res = await fetch(url);
|
||||
return readableToBuffer(res.body!);
|
||||
}
|
||||
|
||||
async function readableToBuffer(source: AsyncIterable<any>) {
|
||||
// https://stackoverflow.com/a/72891118
|
||||
const buffers = [];
|
||||
for await (const data of source) {
|
||||
buffers.push(data);
|
||||
}
|
||||
return Buffer.concat(buffers);
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||
|
||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||
const dom = parseHTML(html);
|
||||
|
|
Loading…
Reference in a new issue