mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
3 commits
3396a433e5
...
1439c3dd1d
Author | SHA1 | Date | |
---|---|---|---|
1439c3dd1d | |||
4f135f8464 | |||
23ea94ecd5 |
22 changed files with 26 additions and 2132 deletions
22
.github/workflows/container.yml
vendored
22
.github/workflows/container.yml
vendored
|
@ -88,12 +88,32 @@ jobs:
|
|||
uses: actions/cache@v3
|
||||
with:
|
||||
path: usr/src/app/target
|
||||
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
|
||||
key: usr/src/app/target-${{ hashFiles('Dockerfile.scraper') }}
|
||||
- name: inject usr/src/app/target into docker
|
||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||
with:
|
||||
cache-source: usr/src/app/target
|
||||
cache-target: /usr/src/app/target
|
||||
- name: Cache root/.cargo/registry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: root/.cargo/registry
|
||||
key: root/.cargo/registry-${{ hashFiles('Dockerfile.scraper') }}
|
||||
- name: inject root/.cargo/registry into docker
|
||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||
with:
|
||||
cache-source: root/.cargo/registry
|
||||
cache-target: /root/.cargo/registry
|
||||
- name: Cache root/.cargo/git
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: root/.cargo/git
|
||||
key: root/.cargo/git-${{ hashFiles('Dockerfile.scraper') }}
|
||||
- name: inject root/.cargo/git into docker
|
||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||
with:
|
||||
cache-source: root/.cargo/git
|
||||
cache-target: /root/.cargo/git
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
|
||||
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
|
||||
|
||||
/** @type { import("drizzle-kit").Config } */
|
||||
export default {
|
||||
|
|
|
@ -1,33 +0,0 @@
|
|||
import pMap from "p-map";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
import { getUrlsFromSitemap } from "./common.js";
|
||||
|
||||
/**
 * Collects Carrefour product links.
 *
 * Currently this only runs the sitemap-based collection and resolves once it
 * has finished.
 */
export async function scrapCarrefourProducts(): Promise<void> {
  await scrapBySitemap();
}
|
||||
|
||||
/**
 * Downloads the Carrefour product sitemaps and stores every URL found in them
 * through `saveUrls`.
 *
 * The sitemap list comes from https://www.carrefour.com.ar/sitemap.xml.
 * At most three sitemaps are fetched at the same time.
 *
 * A sitemap that answers with a non-2xx status is logged and skipped, so an
 * error page is never parsed as if it were an (empty) sitemap.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];

  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        console.error(
          `skipped sitemap ${sitemapUrl} because status=${res.status}`
        );
        return;
      }
      const xml = await res.text();
      // NOTE(review): saveUrls is defined elsewhere; awaiting is a no-op if it
      // is synchronous and propagates its failure if it returns a promise.
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}
|
|
@ -1,14 +0,0 @@
|
|||
import { decodeXML } from "entities";
|
||||
/**
 * Extracts the distinct URLs contained in the `<loc>` elements of a sitemap.
 *
 * @param xml Raw sitemap document.
 * @returns The XML-entity-decoded URLs, de-duplicated, in first-seen order.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // HTMLRewriter may deliver the text of one node in several chunks, so the
  // pieces are buffered until the chunk flagged as the last of its text node.
  let pending = "";

  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;

        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        urls.add(decodeXML(txt));
      },
    })
    .transform(new Response(xml));

  return Array.from(urls);
}
|
|
@ -1,44 +0,0 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import PQueue from "p-queue";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
|
||||
/**
 * Collects Coto product links by walking the paginated product listing.
 *
 * Page URLs are derived from a fixed listing URL by overriding its `No`
 * (offset) and `Nrpp` (page size) query parameters. Enough pages are generated
 * to cover 29000 results and they are processed through a queue that runs four
 * `getPage` tasks at a time.
 */
export async function scrapCotoProducts() {
  const initial =
    "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";

  const pageSize = 300; // up to 1000
  const pageCount = Math.ceil(29000 / pageSize);
  const queue = new PQueue({ concurrency: 4 });

  const links: string[] = [];
  for (let page = 0; page < pageCount; page++) {
    const pageUrl = new URL(initial);
    pageUrl.searchParams.set("No", `${page * pageSize}`);
    pageUrl.searchParams.set("Nrpp", `${pageSize}`);
    links.push(pageUrl.toString());
  }

  await Promise.all(links.map((link) => queue.add(getPage(link))));
}
|
||||
|
||||
/**
 * Builds the queue task that downloads one listing page and stores the product
 * links found in it.
 *
 * The download is retried a bounded number of times with a growing pause
 * between attempts. If every attempt fails the page is logged and skipped, so
 * one unreachable page can neither loop forever nor abort the other pages.
 *
 * @param url Absolute URL of the listing page.
 * @returns An async function suitable for `queue.add`.
 */
function getPage(url: string) {
  const maxAttempts = 5;

  return async () => {
    let html: string | undefined;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        const res = await fetch(url);
        html = await res.text();
        break;
      } catch (error) {
        console.error(
          `fetch of ${url} failed (attempt ${attempt}/${maxAttempts})`,
          error
        );
        if (attempt < maxAttempts) {
          await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
        }
      }
    }

    if (html === undefined) {
      console.error(`giving up on ${url} after ${maxAttempts} attempts`);
      return;
    }

    const { document } = parseHTML(html);

    // Links are resolved against the page URL so relative hrefs become absolute.
    const hrefs = Array.from(
      document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
      (a) => new URL(a.href, url).toString()
    );
    saveUrls(hrefs);
  };
}
|
|
@ -1,124 +0,0 @@
|
|||
import pMap from "p-map";
|
||||
import { parseHTML } from "linkedom";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
import { getUrlsFromSitemap } from "./common.js";
|
||||
|
||||
/**
 * Listing pages of diaonline.supermercadosdia.com.ar that `scrapBySite`
 * paginates through: category pages plus product-cluster listings.
 */
const categorias = [
  // almacen
  "https://diaonline.supermercadosdia.com.ar/almacen",
  "https://diaonline.supermercadosdia.com.ar/almacen/conservas",
  "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
  "https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
  "https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
  "https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
  "https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
  "https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
  "https://diaonline.supermercadosdia.com.ar/almacen/harinas",
  "https://diaonline.supermercadosdia.com.ar/almacen/picadas",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",

  // desayuno
  "https://diaonline.supermercadosdia.com.ar/desayuno",
  "https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
  "https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
  "https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",

  // frescos
  "https://diaonline.supermercadosdia.com.ar/frescos",
  "https://diaonline.supermercadosdia.com.ar/frescos/leches",
  "https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
  "https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
  "https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
  "https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",

  // bebidas
  "https://diaonline.supermercadosdia.com.ar/bebidas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
  "https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",

  // congelados
  "https://diaonline.supermercadosdia.com.ar/congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
  "https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
  "https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
  "https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/hielo",

  // limpieza
  "https://diaonline.supermercadosdia.com.ar/limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
  "https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
  "https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
  "https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",

  // product-cluster listings
  "https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
];
|
||||
|
||||
/**
 * Collects Dia product links.
 *
 * Only the sitemap-based collection runs; the category crawl (`scrapBySite`)
 * is intentionally left disabled.
 */
export async function scrapDiaProducts(): Promise<void> {
  // scrapBySite() is disabled; sitemaps are the only active source.
  await scrapBySitemap();
}
|
||||
|
||||
/**
 * Downloads the Dia product sitemaps and stores every URL found in them
 * through `saveUrls`.
 *
 * The sitemap list comes from
 * https://diaonline.supermercadosdia.com.ar/sitemap.xml. At most three
 * sitemaps are fetched at the same time.
 *
 * A sitemap that answers with a non-2xx status is logged and skipped, so an
 * error page is never parsed as if it were an (empty) sitemap.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
  ];

  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        console.error(
          `skipped sitemap ${sitemapUrl} because status=${res.status}`
        );
        return;
      }
      const xml = await res.text();
      // NOTE(review): saveUrls is defined elsewhere; awaiting is a no-op if it
      // is synchronous and propagates its failure if it returns a promise.
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}
|
||||
|
||||
/**
 * Collects Dia product links by crawling the listing pages in `categorias`.
 *
 * For every listing, pages 0 through 50 are requested (via the `page` query
 * parameter) and the product-summary anchors found on each page are stored
 * through `saveUrls`. Up to 32 pages are fetched concurrently.
 */
async function scrapBySite() {
  const pageNumbers = Array.from({ length: 51 }, (_, page) => page);

  const links: string[] = [];
  for (const category of categorias) {
    for (const page of pageNumbers) {
      const pageUrl = new URL(category);
      pageUrl.searchParams.set("page", `${page}`);
      links.push(pageUrl.toString());
    }
  }

  const collectLinks = async (url: string) => {
    const res = await fetch(url, { timeout: false });
    const { document } = parseHTML(await res.text());

    const anchors = document.querySelectorAll<HTMLAnchorElement>(
      "a.vtex-product-summary-2-x-clearLink"
    );
    // Resolve against the page URL so relative hrefs become absolute.
    const hrefs = Array.from(anchors, (a) => new URL(a.href, url).toString());
    saveUrls(hrefs);
  };

  await pMap(links, collectLinks, { concurrency: 32 });
}
|
|
@ -1,38 +0,0 @@
|
|||
import pMap from "p-map";
|
||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||
import { getUrlsFromSitemap } from "./common.js";
|
||||
|
||||
/**
 * Collects Jumbo product links.
 *
 * Currently this only runs the sitemap-based collection and resolves once it
 * has finished.
 */
export async function scrapJumboProducts(): Promise<void> {
  await scrapBySitemap();
}
|
||||
|
||||
/**
 * Downloads the Jumbo product sitemaps and stores every URL found in them
 * through `saveUrls`.
 *
 * The sitemap list comes from https://www.jumbo.com.ar/sitemap.xml.
 * At most three sitemaps are fetched at the same time.
 *
 * A sitemap that answers with a non-2xx status is logged and skipped, so an
 * error page is never parsed as if it were an (empty) sitemap.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];

  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        console.error(
          `skipped sitemap ${sitemapUrl} because status=${res.status}`
        );
        return;
      }
      const xml = await res.text();
      // NOTE(review): saveUrls is defined elsewhere; awaiting is a no-op if it
      // is synchronous and propagates its failure if it returns a promise.
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}
|
|
@ -1,18 +0,0 @@
|
|||
{
|
||||
"name": "link-scrapers",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"entities": "^4.5.0",
|
||||
"linkedom": "^0.16.5",
|
||||
"p-queue": "^8.0.1"
|
||||
}
|
||||
}
|
1278
pnpm-lock.yaml
1278
pnpm-lock.yaml
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,3 @@
|
|||
packages:
|
||||
- link-scrapers
|
||||
- scraper
|
||||
- sitio
|
||||
- db-datos
|
||||
|
|
|
@ -111,7 +111,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
|||
}
|
||||
|
||||
fn connect_db() -> Pool {
|
||||
let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
|
||||
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
||||
let cfg = deadpool_sqlite::Config::new(db_path);
|
||||
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
|
||||
pool
|
||||
|
|
137
scraper/auto.ts
137
scraper/auto.ts
|
@ -1,137 +0,0 @@
|
|||
import { mkdtemp, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
|
||||
import PQueue from "p-queue";
|
||||
import { formatDuration, intervalToDuration } from "date-fns";
|
||||
import { db } from "db-datos/db.js";
|
||||
import { like } from "drizzle-orm";
|
||||
import { productoUrls } from "db-datos/schema.js";
|
||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||
import { readableStreamToText } from "bun";
|
||||
|
||||
// Scraping goes through a queue so that there are never several writers to the DB and the CPU is not overloaded.
|
||||
const scrapQueue = new PQueue({ concurrency: 1 });
|
||||
|
||||
/**
 * Runs the automatic pipeline for every known supermarket and resolves once
 * each supermarket's `downloadList` call has settled successfully.
 */
export async function auto() {
  const runner = new Auto();
  const downloads = supermercados.map((supermercado) =>
    runner.downloadList(supermercado)
  );
  await Promise.all(downloads);
}
|
||||
|
||||
/**
 * Drives one automatic run: collects product links per supermarket, writes the
 * stored URLs of that supermarket to a temporary list file and hands the file
 * to the `scraper-rs` binary. Progress is reported on the console and, when
 * both Telegram environment variables are set, to a Telegram chat.
 */
class Auto {
  /** Telegram credentials; stays undefined when either env var is missing. */
  telegramConfig?: { token: string; chatId: string };

  constructor() {
    if (!process.env.TELEGRAM_BOT_TOKEN)
      console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
    else if (!process.env.TELEGRAM_BOT_CHAT_ID)
      console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
    else
      this.telegramConfig = {
        token: process.env.TELEGRAM_BOT_TOKEN,
        chatId: process.env.TELEGRAM_BOT_CHAT_ID,
      };

    this.inform("[auto] Empezando scrap");
  }

  /** Runs the link scraper of one supermarket and reports how long it took. */
  async scrapUrls(supermercado: Supermercado) {
    const t0 = performance.now();
    switch (supermercado) {
      case "Dia":
        await scrapDiaProducts();
        break;
      case "Coto":
        await scrapCotoProducts();
        break;
      case "Carrefour":
        await scrapCarrefourProducts();
        break;
      case "Jumbo":
        await scrapJumboProducts();
        break;
    }
    this.inform(
      `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
    );
  }

  /**
   * Collects the links of one supermarket, dumps its stored URLs into a
   * temporary list file and scrapes that list.
   *
   * @throws Error when `hosts` has no entry for the supermarket.
   */
  async downloadList(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));

    await scrapQueue.add(async () => {
      await this.scrapUrls(supermercado);
    });

    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
    const hostEntry = Object.entries(hosts).find(
      ([, supe]) => supe === supermercado
    );
    if (!hostEntry)
      throw new Error(`No hay host configurado para ${supermercado}`);
    const host = hostEntry[0];

    const results = await db.query.productoUrls
      .findMany({
        where: like(productoUrls.url, `%${host}%`),
      })
      .execute();
    const urls = results.map((r) => r.url);
    await writeFile(listPath, urls.join("\n") + "\n");

    // Awaited so the run only counts as finished once the list was processed;
    // scrapAndInform reports its own failures instead of rejecting.
    await this.scrapAndInform({ listPath });
    // TODO: delete the temporary files
  }

  /**
   * Runs `scraper-rs fetch-list` on a list file through the scrap queue and
   * reports the outcome. A failing run (spawn error or non-zero exit code) is
   * logged and reported as "Algo falló" instead of escaping as a rejection.
   */
  async scrapAndInform({ listPath }: { listPath: string }) {
    try {
      const res = await scrapQueue.add(async () => {
        const t0 = performance.now();

        const sub = Bun.spawn({
          cmd: ["scraper-rs", "fetch-list", listPath],
          stdio: ["ignore", "pipe", "inherit"],
        });
        const text = await readableStreamToText(sub.stdout);
        const code = await sub.exited;
        if (code !== 0) throw new Error(`scraper-rs threw ${code}`);

        return { took: performance.now() - t0, text };
      });

      if (!res) {
        this.inform(`Algo falló en ${listPath}`);
        return;
      }

      const { took, text } = res;
      this.inform(`Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`);
    } catch (error) {
      console.error(error);
      this.inform(`Algo falló en ${listPath}`);
    }
  }

  /** Logs a message to the console and forwards it to Telegram if configured. */
  inform(msg: string) {
    // Telegram delivery is best-effort: a failed request must not surface as
    // an unhandled rejection. Only the error name is logged because the
    // request URL embeds the bot token.
    this.sendTelegramMsg(msg).catch((error) => {
      const reason = error instanceof Error ? error.name : "error desconocido";
      console.error(`[auto] no pude mandar el mensaje a Telegram (${reason})`);
    });
    console.info(msg);
  }

  /** Reports a message and returns it wrapped in an `Error` for the caller. */
  report(msg: string) {
    this.inform(msg);
    const error = new Error(msg);

    return error;
  }

  /** Sends a text message through the Telegram Bot API; no-op if unconfigured. */
  async sendTelegramMsg(text: string) {
    if (!this.telegramConfig) return;
    const url = new URL(
      `https://api.telegram.org/bot${this.telegramConfig.token}/sendMessage`
    );
    url.searchParams.set("chat_id", this.telegramConfig.chatId);
    url.searchParams.set("text", text);
    await fetch(url);
  }
}
|
||||
|
||||
/**
 * Formats a duration given in milliseconds as text, using date-fns'
 * `formatDuration` on the interval from 0 to the rounded millisecond value.
 */
function formatMs(ms: number) {
  const duration = intervalToDuration({ start: 0, end: Math.round(ms) });
  return formatDuration(duration);
}
|
|
@ -1,44 +0,0 @@
|
|||
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||
import { auto } from "./auto.js";
|
||||
import { downloadList, getProduct } from "./scrap.js";
|
||||
import Cron from "croner";
|
||||
|
||||
// Command-line entry point: the first CLI argument selects the action to run.
switch (process.argv[2]) {
  case "auto":
    await auto();
    break;

  case "cron":
    // Cron pattern "0 2 * * *": minute 0 of hour 2, every day.
    Cron("0 2 * * *", () => {
      // A failed run must not become an unhandled rejection that could take
      // the long-lived cron process down.
      auto().catch((error) => console.error("[cron] auto falló:", error));
    });
    break;

  case "scrap-carrefour-links":
    await scrapCarrefourProducts();
    break;

  case "scrap-dia-links":
    await scrapDiaProducts();
    break;

  case "scrap-coto-links":
    await scrapCotoProducts();
    break;

  case "scrap-jumbo-links":
    await scrapJumboProducts();
    break;

  case "scrap-link": {
    const rawUrl = process.argv[3];
    if (!rawUrl) {
      console.error("Especificá una URL para scrapear.");
      process.exit(1);
    }
    const url = new URL(rawUrl);
    const res = await fetch(url);
    const text = await res.text();
    console.info(await getProduct(url, text));
    break;
  }

  case "scrap": {
    const urlLists = process.argv.slice(3);
    if (urlLists.length === 0) {
      console.error("Especificá listas de urls para scrapear.");
      process.exit(1);
    }
    for (const path of urlLists) {
      const res = await downloadList(path);
      console.info("=======================================");
      console.info(path, res);
      console.info("=======================================");
    }
    break;
  }

  default:
    console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
    process.exit(1);
}
|
|
@ -1,29 +0,0 @@
|
|||
{
|
||||
"name": "scraper",
|
||||
"type": "module",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"check": "tsc"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.478.0",
|
||||
"@aws-sdk/lib-storage": "^3.478.0",
|
||||
"croner": "^8.0.0",
|
||||
"date-fns": "^3.0.6",
|
||||
"db-datos": "workspace:^",
|
||||
"drizzle-orm": "^0.29.1",
|
||||
"linkedom": "^0.16.5",
|
||||
"nanoid": "^5.0.4",
|
||||
"p-map": "^7.0.1",
|
||||
"p-queue": "^8.0.1",
|
||||
"zod": "^3.22.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "^5.3.3"
|
||||
}
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||
|
||||
/**
 * Reads a JSON blob embedded in the page inside
 * `<template data-type="json" data-varname="...">`.
 *
 * @param dom Parsed page.
 * @param varname Value of the template's `data-varname` attribute.
 * @returns The parsed JSON, typed as `T` without runtime validation.
 * @throws Error when the template or its first child element is missing.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const script = template?.content?.children[0];
  if (!script) throw new Error("no encuentro el script");
  return JSON.parse(script.innerHTML);
}
|
||||
/**
 * Extracts the product EAN from the `__STATE__` JSON embedded in the page.
 *
 * The state is searched for the entry whose key starts with `Product:` and
 * whose `__typename` is `"Product"`, then for the `"SKU"` entry whose key
 * starts with `Product:<cacheId of that product>`; the EAN is read from it.
 *
 * @throws Error when the product entry, the SKU entry or its EAN is missing.
 */
function eanFromSeedState(dom: Window): string {
  const json = parseScriptJson<object>(dom, "__STATE__");
  const entries = Object.entries(json);

  // `val?.` tolerates null/primitive entries instead of throwing a TypeError.
  const productJson = entries.find(
    ([key, val]) => key.startsWith("Product:") && val?.__typename === "Product"
  );
  if (!productJson) throw new Error("no encontré el product en el json");

  const skuPrefix = `Product:${productJson[1].cacheId}`;
  const productSkuJson = entries.find(
    ([key, val]) => key.startsWith(skuPrefix) && val?.__typename === "SKU"
  );
  if (!productSkuJson) throw new Error("no encontré el sku en el json");

  // Fail here rather than hand an undefined EAN to the caller as a string.
  const ean = productSkuJson[1].ean;
  if (ean === undefined || ean === null || ean === "")
    throw new Error("el sku no tiene ean");
  return String(ean);
}
|
||||
|
||||
/**
 * Parses a Carrefour product page.
 *
 * Price and stock come from the page's meta tags, the EAN from the embedded
 * state JSON, and name/image from the Product JSON-LD. When the JSON-LD cannot
 * be read the error is rethrown for in-stock products; for out-of-stock
 * products name and image are left undefined.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const ean = eanFromSeedState(dom);

  let ld: ReturnType<typeof getProductJsonLd> | undefined;
  try {
    ld = getProductJsonLd(dom);
  } catch (error) {
    // Some out-of-stock pages carry no JSON-LD; only then is it tolerated.
    if (inStock) throw error;
  }

  return {
    name: ld?.name,
    imageUrl: ld?.image,
    ean,
    precioCentavos,
    inStock,
  };
}
|
|
@ -1,55 +0,0 @@
|
|||
import { z } from "zod";
|
||||
|
||||
/**
 * Returns the `content` attribute of the `<meta property="...">` tag with the
 * given property name.
 *
 * @returns The attribute value; `undefined` when no such tag exists and `null`
 *   when the tag has no `content` attribute.
 */
export function getMetaProp(dom: Window, prop: string) {
  return dom.window.document
    .querySelector(`meta[property="${prop}"]`)
    ?.getAttribute("content");
}

/**
 * Reads the product price from the `product:price:amount` meta tag.
 *
 * @returns The price in cents as an integer, or `null` when the tag is
 *   missing, empty or not a finite number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  const precio = parseFloat(precioMeta);
  if (!Number.isFinite(precio)) return null;
  // Rounded because binary floating point makes e.g. 19.99 * 100 come out as
  // 1998.9999999999998 rather than 1999.
  return Math.round(precio * 100);
}

/**
 * Tells whether the `product:availability` meta tag reads exactly `instock`.
 */
export function stockFromMeta(dom: Window) {
  const stockMeta = getMetaProp(dom, "product:availability");
  return stockMeta === "instock";
}
|
||||
|
||||
/**
 * Parses every `<script type="application/ld+json">` block of the page.
 *
 * Blocks that are not valid JSON, or whose JSON is not an object (`null`,
 * strings, numbers, booleans), are skipped so that one broken block cannot
 * prevent the remaining ones from being read.
 *
 * @returns The parsed objects (arrays included), in document order.
 */
function parseJsonLds(dom: Window): object[] {
  const scripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );

  const parsed: object[] = [];
  for (const script of Array.from(scripts)) {
    let value: unknown;
    try {
      value = JSON.parse(script.innerHTML);
    } catch {
      continue;
    }
    if (typeof value === "object" && value !== null) parsed.push(value);
  }
  return parsed;
}
|
||||
/**
 * Returns the first JSON-LD object of the page whose `@type` equals `type`,
 * or `undefined` when there is none.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const candidate of parseJsonLds(dom)) {
    if ("@type" in candidate && candidate["@type"] === type) return candidate;
  }
  return undefined;
}
|
||||
|
||||
/** One entry of the `offers.offers` array inside a Product JSON-LD. */
const zOfferLd = z.object({
  "@type": z.literal("Offer"),
  price: z.number(),
  priceCurrency: z.literal("ARS"),
  availability: z.enum([
    "http://schema.org/OutOfStock",
    "http://schema.org/InStock",
  ]),
});

/**
 * Schema of the Product JSON-LD fields the parsers rely on: name, image, an
 * optional SKU and the nested list of offers.
 */
const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  sku: z.string().optional(),
  offers: z.object({
    offers: z.array(zOfferLd),
  }),
});

/** Validated Product JSON-LD, as produced by `zProductLd`. */
type ProductLd = z.infer<typeof zProductLd>;
|
||||
|
||||
/**
 * Finds the Product JSON-LD of the page and validates it against `zProductLd`.
 *
 * @throws The zod validation error when the page has no Product JSON-LD or it
 *   does not match the schema.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}
|
|
@ -1,48 +0,0 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
|
||||
/**
 * Reads the EAN shown on a Coto product page.
 *
 * The first `div#brandText` whose text contains "| EAN: " is located, and the
 * trimmed text of its second `span.span_codigoplu` is returned.
 *
 * @throws Error when that div is missing, or the span is missing or empty.
 */
function getEanFromText({ document }: Window) {
  let eanParent: Element | undefined;
  for (const el of Array.from(document.querySelectorAll("div#brandText"))) {
    if (el.textContent?.includes("| EAN: ")) {
      eanParent = el;
      break;
    }
  }
  if (!eanParent) throw new Error("no encuentro el eanparent");

  const codeSpans = Array.from(
    eanParent.querySelectorAll("span.span_codigoplu")
  );
  const ean = codeSpans[1]?.textContent?.trim();
  if (!ean) throw new Error("no encuentro el ean");
  return ean;
}
|
||||
/**
 * Reads the price shown in `.atg_store_newPrice` on a Coto product page.
 *
 * The text uses "." as thousands separator and "," as decimal separator
 * (e.g. "$1.234,50"); it is normalised before parsing.
 *
 * @returns The price in cents as an integer, or `null` when the element is
 *   missing, empty or does not contain a finite number.
 */
function getPriceFromText({ document }: Window) {
  const el = document.querySelector(".atg_store_newPrice");
  if (!el?.textContent) return null;

  const nStr = el.textContent
    .trim()
    .replace("$", "")
    .replace(/\./g, "")
    .replace(",", ".");
  const precio = parseFloat(nStr);
  if (!Number.isFinite(precio)) return null;
  // Rounded because binary floating point makes e.g. 19.99 * 100 come out as
  // 1998.9999999999998 rather than 1999.
  return Math.round(precio * 100);
}
|
||||
/**
 * Tells whether the product is in stock: true when the page contains no
 * `.product_not_available` element.
 */
function getInStock({ document }: Window) {
  const unavailableMarker = document.querySelector(".product_not_available");
  return !unavailableMarker;
}
|
||||
|
||||
/**
 * Parses a Coto product page into name, image URL, EAN, price in cents and
 * stock flag.
 *
 * @throws Error (from `getEanFromText`) when the EAN cannot be found.
 */
export function getCotoProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const { document } = dom;

  const titleEl = document.querySelector("h1.product_page");
  const imageEl = document.querySelector<HTMLImageElement>(".zoom img");

  return {
    name: titleEl?.textContent?.trim(),
    imageUrl: imageEl?.src,
    ean: getEanFromText(dom),
    precioCentavos: getPriceFromText(dom),
    inStock: getInStock(dom),
  };
}
|
|
@ -1,25 +0,0 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
/**
 * Parses a Dia product page.
 *
 * The EAN comes from the `product:retailer_item_id` meta tag, the price from
 * the price meta tag, and name, image and availability from the Product
 * JSON-LD. Stock is taken from the first offer; a product whose JSON-LD lists
 * no offers is reported as out of stock.
 *
 * @throws Error when the EAN meta tag is missing; the validation error of
 *   `getProductJsonLd` when the JSON-LD is missing or malformed.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) throw new Error("No encontré el ean");
  const precioCentavos = priceFromMeta(dom);

  const ld = getProductJsonLd(dom);
  const name = ld.name;
  const imageUrl = ld.image;
  // The schema allows an empty offers array, so the first offer may not exist.
  const inStock =
    ld.offers.offers[0]?.availability === "http://schema.org/InStock";

  return {
    name,
    imageUrl,
    ean,
    precioCentavos,
    inStock,
  };
}
|
|
@ -1,54 +0,0 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||
import { z } from "zod";
|
||||
|
||||
/**
 * Shape expected from Jumbo's product search: exactly one product, of which
 * only the EAN of each item is read.
 */
const zJumboSearch = z.tuple([
  z.object({
    items: z.array(
      z.object({
        ean: z.string(),
      })
    ),
  }),
]);

/**
 * Looks up the EAN of a Jumbo SKU through the catalog search API.
 *
 * @param sku Retailer SKU id, sent as the `fq=skuId:<sku>` filter.
 * @returns The EAN shared by every item of the matching product.
 * @throws Error when the request fails, the response does not match
 *   `zJumboSearch`, the product has no items, or items disagree on the EAN.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`La búsqueda de Jumbo respondió status=${res.status}`);
  }
  const parsed = zJumboSearch.parse(await res.json());

  const [first, ...rest] = parsed[0].items;
  // The schema accepts an empty items array; report it explicitly.
  if (!first) throw new Error("Inesperado: el producto no tiene items");
  if (!rest.every((x) => x.ean === first.ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return first.ean;
}
|
||||
|
||||
/**
 * Parses a Jumbo product page.
 *
 * Price and stock come from the page's meta tags, name and image from the
 * Product JSON-LD, and the EAN from a catalog search keyed by the JSON-LD SKU.
 *
 * @throws Error when the JSON-LD carries no SKU, plus whatever
 *   `getProductJsonLd` or `getEanFromSearch` throw.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);

  const { name, image: imageUrl, sku: retailerSku } = getProductJsonLd(dom);
  if (!retailerSku)
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");

  const ean = await getEanFromSearch(retailerSku);

  return { name, imageUrl, ean, precioCentavos, inStock };
}
|
127
scraper/scrap.ts
127
scraper/scrap.ts
|
@ -1,127 +0,0 @@
|
|||
/// <reference lib="dom" />
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { writeFile, mkdir } from "fs/promises";
|
||||
import { createHash } from "crypto";
|
||||
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||
import { getDiaProduct } from "./parsers/dia.js";
|
||||
import { getCotoProduct } from "./parsers/coto.js";
|
||||
import { join } from "path";
|
||||
import { db } from "db-datos/db.js";
|
||||
import pMap from "p-map";
|
||||
import { getJumboProduct } from "./parsers/jumbo.js";
|
||||
|
||||
/** When true, the HTML of pages that failed to parse is written under `debug/`. */
const DEBUG = true;
/** Stored with every inserted price row as its `parserVersion`. */
const PARSER_VERSION = 4;

/** Row shape accepted when inserting into the `precios` table. */
export type Precio = typeof schema.precios.$inferInsert;

/** Columns that are not part of what a site parser returns. */
type NonParserColumns =
  | "fetchedAt"
  | "url"
  | "id"
  | "warcRecordId"
  | "parserVersion";

/** What a site parser returns: a `Precio` without the columns listed above. */
export type Precioish = Omit<Precio, NonParserColumns>;
|
||||
|
||||
/**
 * Scrapes every URL listed (one per line) in the file at `path`.
 *
 * Up to 32 URLs are processed concurrently. A URL whose scrape ends in an
 * error is retried after a 1.5 s pause, for at most six attempts in total;
 * the last error is logged.
 *
 * @returns Counts of done and skipped URLs plus the list of error results.
 */
export async function downloadList(path: string) {
  const fileText = await Bun.file(path).text();
  const list = fileText.split("\n").filter((line) => line.length > 0);

  const scrapWithRetries = async (urlS: string) => {
    let res: ScrapResult = { type: "skipped" };
    let attempts = 0;
    while (attempts < 6) {
      if (attempts !== 0) await wait(1500);
      res = await scrap(urlS);
      if (res.type !== "error") break;
      attempts++;
    }
    if (res.type === "error") console.error(res);
    return res;
  };

  const results = await pMap(list, scrapWithRetries, { concurrency: 32 });

  const progress: {
    done: number;
    skipped: number;
    errors: { error: any; url: string; debugPath: string }[];
  } = { done: 0, skipped: 0, errors: [] };

  for (const result of results) {
    if (result.type === "done") progress.done++;
    else if (result.type === "skipped") progress.skipped++;
    else progress.errors.push(result);
  }
  return progress;
}
|
||||
|
||||
/**
 * Parses a product page with the parser that matches the URL's hostname.
 *
 * @param url URL the HTML was fetched from; only its hostname is used.
 * @param html Page contents.
 * @throws Error when the hostname belongs to no known supermarket, plus
 *   whatever the selected parser throws.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
}
|
||||
|
||||
/** Outcome of scraping one URL. */
type ScrapResult =
  | { type: "skipped" }
  | { type: "done" }
  | { type: "error"; url: string; error: any; debugPath: string };

/**
 * Fetches one product URL, parses it and inserts the price row.
 *
 * - Invalid URLs and non-OK HTTP responses yield `skipped`.
 * - A failed download (network error while fetching or reading the body)
 *   yields `error` with an empty `debugPath`, since there is no HTML to dump;
 *   it is returned rather than thrown so the caller's retry loop applies.
 * - A failure while parsing or inserting yields `error`; when `DEBUG` is on
 *   the page HTML is written to `debugPath`.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  let html: string;
  try {
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
  } catch (error) {
    return { type: "error", url: urlS, error, debugPath: "" };
  }

  try {
    const ish = await getProduct(url, html);

    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };

    await db.insert(schema.precios).values(p);

    return { type: "done" };
  } catch (error) {
    // The dump is named after the URL's MD5 so repeated failures of the same
    // URL overwrite one file.
    const urlHash = createHash("md5").update(urlS).digest("hex");
    const output = join("debug", `${urlHash}.html`);
    if (DEBUG) {
      await mkdir("debug", { recursive: true });
      await writeFile(output, html);
    }
    return {
      type: "error",
      url: urlS,
      error,
      debugPath: output,
    };
  }
}
|
||||
|
||||
/** Returns a promise that resolves after `ms` milliseconds. */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
@ -1,4 +0,0 @@
|
|||
{
|
||||
"extends": "../tsconfig.json",
|
||||
"exclude": ["../sitio"]
|
||||
}
|
|
@ -11,7 +11,7 @@ export const load: PageServerLoad = async ({ url }) => {
|
|||
join precios p on p.ean = f.ean
|
||||
where f.name match ${`"${query}"`}
|
||||
group by p.ean
|
||||
having max(p.fetched_at) and max(p.in_stock)
|
||||
having max(p.fetched_at)
|
||||
order by p.in_stock desc;`;
|
||||
results = db.all(sqlQuery);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue