mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 22:26:19 +00:00
scraper: reordenar código
- borrar código viejo
- centralizar scrapers de links
This commit is contained in:
parent
a322bc36fc
commit
fa6de68f60
15 changed files with 17 additions and 66 deletions
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,17 +0,0 @@
|
||||||
{
|
|
||||||
"name": "carrefour-link-scraper",
|
|
||||||
"type": "module",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "",
|
|
||||||
"main": "index.js",
|
|
||||||
"scripts": {
|
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
|
||||||
},
|
|
||||||
"keywords": [],
|
|
||||||
"author": "",
|
|
||||||
"license": "ISC",
|
|
||||||
"dependencies": {
|
|
||||||
"linkedom": "^0.16.5",
|
|
||||||
"p-map": "^7.0.1"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,17 +0,0 @@
|
||||||
{
|
|
||||||
"name": "dia-link-scraper",
|
|
||||||
"type": "module",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "",
|
|
||||||
"main": "index.js",
|
|
||||||
"scripts": {
|
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
|
||||||
},
|
|
||||||
"keywords": [],
|
|
||||||
"author": "",
|
|
||||||
"license": "ISC",
|
|
||||||
"dependencies": {
|
|
||||||
"linkedom": "^0.16.5",
|
|
||||||
"p-map": "^7.0.0"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,4 +1,3 @@
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import PQueue from "p-queue";
|
import PQueue from "p-queue";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
@ -28,12 +27,13 @@ function getPage(url: string) {
|
||||||
return async () => {
|
return async () => {
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
html = await getHtml(url);
|
const res = await fetch(url);
|
||||||
|
html = await res.text();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
await getPage(url)();
|
await getPage(url)();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { document } = parseHTML(html.toString("utf-8"));
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
|
|
@ -1,7 +1,6 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
import { decodeXML } from "entities";
|
import { decodeXML } from "entities";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
|
||||||
const categorias = [
|
const categorias = [
|
||||||
|
@ -111,8 +110,9 @@ async function scrapBySite() {
|
||||||
await pMap(
|
await pMap(
|
||||||
links,
|
links,
|
||||||
async (url) => {
|
async (url) => {
|
||||||
const html = await getHtml(url);
|
const res = await fetch(url);
|
||||||
const { document } = parseHTML(html.toString("utf-8"));
|
const html = await res.text();
|
||||||
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
const hrefs = Array.from(
|
const hrefs = Array.from(
|
||||||
document.querySelectorAll<HTMLAnchorElement>(
|
document.querySelectorAll<HTMLAnchorElement>(
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"name": "coto-link-scraper",
|
"name": "link-scrapers",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "",
|
"description": "",
|
|
@ -2,9 +2,7 @@
|
||||||
"name": "preciazo",
|
"name": "preciazo",
|
||||||
"private": true,
|
"private": true,
|
||||||
"workspaces": [
|
"workspaces": [
|
||||||
"dia-link-scraper",
|
"link-scrapers",
|
||||||
"coto-link-scraper",
|
|
||||||
"carrefour-link-scraper",
|
|
||||||
"scraper",
|
"scraper",
|
||||||
"sitio",
|
"sitio",
|
||||||
"db-datos"
|
"db-datos"
|
||||||
|
|
|
@ -4,7 +4,7 @@ scrapeo "masivo" de precios y datos en supermercados argentinos
|
||||||
|
|
||||||
## componentes (en orden de proceso)
|
## componentes (en orden de proceso)
|
||||||
|
|
||||||
- los link scrapers ([coto-link-scraper](./coto-link-scraper/), [dia-link-scraper](./dia-link-scraper/) y [carrefour-link-scraper](./carrefour-link-scraper)) crean listas de links a productos para scrapear
|
- los link scrapers ([link-scrapers/](./link-scrapers/)) crean listas de links a productos para scrapear
|
||||||
|
|
||||||
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
(no hace falta correrlos porque ya hay listas armadas en [data/](./data/))
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,9 @@ import { downloadList } from "./scrap.js";
|
||||||
import { db } from "db-datos/db.js";
|
import { db } from "db-datos/db.js";
|
||||||
import { like } from "drizzle-orm";
|
import { like } from "drizzle-orm";
|
||||||
import { productoUrls } from "db-datos/schema.js";
|
import { productoUrls } from "db-datos/schema.js";
|
||||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
|
|
||||||
const supermercados: Supermercado[] = [
|
const supermercados: Supermercado[] = [
|
||||||
Supermercado.Carrefour,
|
Supermercado.Carrefour,
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
import { auto } from "./auto.js";
|
import { auto } from "./auto.js";
|
||||||
import { downloadList, getProduct } from "./scrap.js";
|
import { downloadList, getProduct } from "./scrap.js";
|
||||||
|
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
export async function getHtml(url: string) {
|
|
||||||
const res = await fetch(url);
|
|
||||||
return readableToBuffer(res.body!);
|
|
||||||
}
|
|
||||||
|
|
||||||
async function readableToBuffer(source: AsyncIterable<any>) {
|
|
||||||
// https://stackoverflow.com/a/72891118
|
|
||||||
const buffers = [];
|
|
||||||
for await (const data of source) {
|
|
||||||
buffers.push(data);
|
|
||||||
}
|
|
||||||
return Buffer.concat(buffers);
|
|
||||||
}
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { Precioish } from "../scrap.js";
|
import { Precioish } from "../scrap.js";
|
||||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "../common.js";
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||||
|
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { type Precioish } from "../scrap.js";
|
import { type Precioish } from "../scrap.js";
|
||||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||||
|
|
||||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||||
const dom = parseHTML(html);
|
const dom = parseHTML(html);
|
||||||
|
|
Loading…
Reference in a new issue