Compare commits

..

No commits in common. "1439c3dd1da6cf4731556b84abf100b7b3ebc64f" and "3396a433e592faa22307105eefc47fee6f539b87" have entirely different histories.

22 changed files with 2132 additions and 26 deletions

View file

@ -88,32 +88,12 @@ jobs:
uses: actions/cache@v3 uses: actions/cache@v3
with: with:
path: usr/src/app/target path: usr/src/app/target
key: usr/src/app/target-${{ hashFiles('Dockerfile.scraper') }} key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
- name: inject usr/src/app/target into docker - name: inject usr/src/app/target into docker
uses: reproducible-containers/buildkit-cache-dance@v2.1.3 uses: reproducible-containers/buildkit-cache-dance@v2.1.3
with: with:
cache-source: usr/src/app/target cache-source: usr/src/app/target
cache-target: /usr/src/app/target cache-target: /usr/src/app/target
- name: Cache root/.cargo/registry
uses: actions/cache@v3
with:
path: root/.cargo/registry
key: root/.cargo/registry-${{ hashFiles('Dockerfile.scraper') }}
- name: inject root/.cargo/registry into docker
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
with:
cache-source: root/.cargo/registry
cache-target: /root/.cargo/registry
- name: Cache root/.cargo/git
uses: actions/cache@v3
with:
path: root/.cargo/git
key: root/.cargo/git-${{ hashFiles('Dockerfile.scraper') }}
- name: inject root/.cargo/git into docker
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
with:
cache-source: root/.cargo/git
cache-target: /root/.cargo/git
- name: Build and push Docker image - name: Build and push Docker image
uses: docker/build-push-action@v5 uses: docker/build-push-action@v5
with: with:

View file

@ -1,4 +1,4 @@
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db"; export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
/** @type { import("drizzle-kit").Config } */ /** @type { import("drizzle-kit").Config } */
export default { export default {

View file

@ -0,0 +1,33 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
/**
 * Collects Carrefour product URLs.
 * The only strategy in use is reading the site's product sitemaps.
 */
export async function scrapCarrefourProducts() {
  return scrapBySitemap();
}
/**
 * Downloads every Carrefour product sitemap and stores the product URLs listed in it.
 *
 * The sitemap list comes from https://www.carrefour.com.ar/sitemap.xml.
 * At most 3 sitemaps are downloaded at the same time.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // An error page is not a sitemap: skip it loudly instead of
        // "successfully" saving zero URLs.
        console.error(
          `[carrefour] skipped ${sitemapUrl} because status=${res.status}`
        );
        return;
      }
      const xml = await res.text();
      // Awaited so a failure while saving is reported through pMap instead of
      // becoming an unhandled rejection (harmless if saveUrls is synchronous).
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}

14
link-scrapers/common.ts Normal file
View file

@ -0,0 +1,14 @@
import { decodeXML } from "entities";
/**
 * Extracts the distinct URLs found in the `<loc>` elements of a sitemap.
 *
 * @param xml Raw sitemap XML.
 * @returns The unique URLs, with XML entities decoded, in first-seen order.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // The text handler may run several times for a single text node (once per
  // chunk), so chunks are buffered and only flushed once the node is complete.
  let pending = "";
  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (txt) urls.add(decodeXML(txt));
      },
    })
    // NOTE(review): this relies on transform() having processed the whole body
    // by the time it returns — confirm for the runtime in use.
    .transform(new Response(xml));
  return Array.from(urls);
}

44
link-scrapers/coto.ts Normal file
View file

@ -0,0 +1,44 @@
import { parseHTML } from "linkedom";
import PQueue from "p-queue";
import { saveUrls } from "db-datos/urlHelpers.js";
/**
 * Collects Coto product URLs by walking the paginated catalogue listing.
 *
 * Every page is fetched through a queue that runs at most 4 requests at once.
 * The figure 29000 is presumably an upper bound on the catalogue size — TODO confirm.
 */
export async function scrapCotoProducts() {
  const initial =
    "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";
  const queue = new PQueue({ concurrency: 4 });
  const pageSize = 300; // up to 1000
  const pageCount = Math.ceil(29000 / pageSize);

  const tasks = [];
  for (let page = 0; page < pageCount; page++) {
    // "No" is the offset of the first product, "Nrpp" the number of products per page.
    const pageUrl = new URL(initial);
    pageUrl.searchParams.set("No", `${page * pageSize}`);
    pageUrl.searchParams.set("Nrpp", `${pageSize}`);
    tasks.push(queue.add(getPage(pageUrl.toString())));
  }
  await Promise.all(tasks);
}
/**
 * Builds the queue task that downloads one Coto listing page and stores the
 * product links found in it.
 *
 * @param url Listing page to download.
 * @param maxAttempts How many times the download is tried before the page is
 *   given up on (the previous implementation retried forever, without pause).
 * @returns An async task; it resolves once the links are saved or the page was skipped.
 */
function getPage(url: string, maxAttempts = 5) {
  return async () => {
    let html: string | undefined;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        const res = await fetch(url);
        html = await res.text();
        break;
      } catch (error) {
        console.error(
          `[coto] fetch failed for ${url} (attempt ${attempt}/${maxAttempts})`,
          error
        );
        // Short, growing pause so a struggling server is not hammered.
        if (attempt < maxAttempts)
          await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
      }
    }
    // Best effort: a page that keeps failing is skipped, not fatal for the run.
    if (html === undefined) return;

    const { document } = parseHTML(html);
    const hrefs = Array.from(
      document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
      // hrefs are resolved against the page URL so they are always absolute
      (a) => new URL(a.href, url).toString()
    );
    // Awaited so a failure while saving reaches the queue's caller
    // (harmless if saveUrls is synchronous).
    await saveUrls(hrefs);
  };
}

124
link-scrapers/dia.ts Normal file
View file

@ -0,0 +1,124 @@
import pMap from "p-map";
import { parseHTML } from "linkedom";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
// Listing pages of Día's online shop (category pages plus a few
// `productClusterIds` listings). scrapBySite() walks each of them page by page
// through the `page` query parameter.
const categorias = [
"https://diaonline.supermercadosdia.com.ar/almacen",
"https://diaonline.supermercadosdia.com.ar/almacen/conservas",
"https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
"https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
"https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
"https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
"https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
"https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
"https://diaonline.supermercadosdia.com.ar/almacen/harinas",
"https://diaonline.supermercadosdia.com.ar/almacen/picadas",
"https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
"https://diaonline.supermercadosdia.com.ar/desayuno",
"https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
"https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
"https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
"https://diaonline.supermercadosdia.com.ar/frescos",
"https://diaonline.supermercadosdia.com.ar/frescos/leches",
"https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
"https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
"https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
"https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
"https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
"https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
"https://diaonline.supermercadosdia.com.ar/bebidas",
"https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
"https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
"https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
"https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
"https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
"https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
"https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
"https://diaonline.supermercadosdia.com.ar/congelados",
"https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
"https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
"https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
"https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
"https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
"https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
"https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
"https://diaonline.supermercadosdia.com.ar/congelados/hielo",
"https://diaonline.supermercadosdia.com.ar/limpieza",
"https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
"https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
"https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
"https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
"https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
"https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
"https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
"https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
"https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
"https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
"https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
];
/**
 * Collects Día product URLs.
 * Only the sitemap strategy is active; the category crawl is kept but disabled.
 */
export async function scrapDiaProducts() {
  const strategies = [
    // scrapBySite(),
    scrapBySitemap(),
  ];
  await Promise.all(strategies);
}
/**
 * Downloads every Día product sitemap and stores the product URLs listed in it.
 *
 * The sitemap list comes from https://diaonline.supermercadosdia.com.ar/sitemap.xml.
 * At most 3 sitemaps are downloaded at the same time.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // An error page is not a sitemap: skip it loudly instead of
        // "successfully" saving zero URLs.
        console.error(`[dia] skipped ${sitemapUrl} because status=${res.status}`);
        return;
      }
      const xml = await res.text();
      // Awaited so a failure while saving is reported through pMap instead of
      // becoming an unhandled rejection (harmless if saveUrls is synchronous).
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}
/**
 * Collects Día product URLs by crawling the listing pages in `categorias`.
 *
 * Each listing is requested with `page` = 0..50 and every product-summary link
 * found is stored. Up to 32 pages are downloaded at the same time.
 */
async function scrapBySite() {
  const links = categorias.flatMap((link) =>
    Array.from({ length: 51 }, (_, i) => {
      const url = new URL(link);
      url.searchParams.set("page", `${i}`);
      return url.toString();
    })
  );
  await pMap(
    links,
    async (url) => {
      // NOTE(review): `timeout` is not a standard fetch option; presumably it
      // disables the runtime's request timeout — confirm.
      const res = await fetch(url, { timeout: false });
      if (!res.ok) {
        // Error pages carry no product links; skip them loudly.
        console.error(`[dia] skipped ${url} because status=${res.status}`);
        return;
      }
      const html = await res.text();
      const { document } = parseHTML(html);
      const hrefs = Array.from(
        document.querySelectorAll<HTMLAnchorElement>(
          "a.vtex-product-summary-2-x-clearLink"
        ),
        // hrefs are resolved against the page URL so they are always absolute
        (a) => new URL(a.href, url).toString()
      );
      // Awaited so a failure while saving is reported through pMap
      // (harmless if saveUrls is synchronous).
      await saveUrls(hrefs);
    },
    { concurrency: 32 }
  );
}

38
link-scrapers/jumbo.ts Normal file
View file

@ -0,0 +1,38 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";
import { getUrlsFromSitemap } from "./common.js";
/**
 * Collects Jumbo product URLs.
 * The only strategy in use is reading the site's product sitemaps.
 */
export async function scrapJumboProducts() {
  return scrapBySitemap();
}
/**
 * Downloads every Jumbo product sitemap and stores the product URLs listed in it.
 *
 * The sitemap list comes from https://www.jumbo.com.ar/sitemap.xml.
 * At most 3 sitemaps are downloaded at the same time.
 */
async function scrapBySitemap() {
  const sitemaps = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];
  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      if (!res.ok) {
        // An error page is not a sitemap: skip it loudly instead of
        // "successfully" saving zero URLs.
        console.error(`[jumbo] skipped ${sitemapUrl} because status=${res.status}`);
        return;
      }
      const xml = await res.text();
      // Awaited so a failure while saving is reported through pMap instead of
      // becoming an unhandled rejection (harmless if saveUrls is synchronous).
      await saveUrls(getUrlsFromSitemap(xml));
    },
    { concurrency: 3 }
  );
}

View file

@ -0,0 +1,18 @@
{
  "name": "link-scrapers",
  "type": "module",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "db-datos": "workspace:^",
    "entities": "^4.5.0",
    "linkedom": "^0.16.5",
    "p-map": "^7.0.1",
    "p-queue": "^8.0.1"
  }
}

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,5 @@
packages: packages:
- link-scrapers
- scraper
- sitio - sitio
- db-datos - db-datos

View file

@ -111,7 +111,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
} }
fn connect_db() -> Pool { fn connect_db() -> Pool {
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string()); let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
let cfg = deadpool_sqlite::Config::new(db_path); let cfg = deadpool_sqlite::Config::new(db_path);
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap(); let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
pool pool

137
scraper/auto.ts Normal file
View file

@ -0,0 +1,137 @@
import { mkdtemp, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
import PQueue from "p-queue";
import { formatDuration, intervalToDuration } from "date-fns";
import { db } from "db-datos/db.js";
import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { readableStreamToText } from "bun";
// All scraping work goes through one queue (concurrency 1) so that there are
// never several writers to the DB and the CPU is not overloaded.
const scrapQueue = new PQueue({ concurrency: 1 });
/**
 * Runs the whole automatic pipeline: one `Auto` instance handles every
 * supermarket in `supermercados`, and the call resolves once all of their
 * `downloadList` promises have resolved.
 */
export async function auto() {
  const runner = new Auto();
  const downloads = supermercados.map((supermercado) =>
    runner.downloadList(supermercado)
  );
  await Promise.all(downloads);
}
/**
 * Drives one automatic scraping run: refreshes the product URL list of each
 * supermarket, hands that list to the `scraper-rs` binary and reports progress
 * on the console and, when configured, on Telegram.
 */
class Auto {
  /** Telegram credentials; left undefined when either env variable is missing. */
  telegramConfig?: { token: string; chatId: string };

  /** Reads the Telegram configuration from the environment and announces the run. */
  constructor() {
    if (!process.env.TELEGRAM_BOT_TOKEN)
      console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
    else if (!process.env.TELEGRAM_BOT_CHAT_ID)
      console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
    else
      this.telegramConfig = {
        token: process.env.TELEGRAM_BOT_TOKEN,
        chatId: process.env.TELEGRAM_BOT_CHAT_ID,
      };

    this.inform("[auto] Empezando scrap");
  }

  /** Runs the link scraper of the given supermarket and reports how long it took. */
  async scrapUrls(supermercado: Supermercado) {
    const t0 = performance.now();
    switch (supermercado) {
      case "Dia":
        await scrapDiaProducts();
        break;
      case "Coto":
        await scrapCotoProducts();
        break;
      case "Carrefour":
        await scrapCarrefourProducts();
        break;
      case "Jumbo":
        await scrapJumboProducts();
        break;
    }
    this.inform(
      `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
    );
  }

  /**
   * Refreshes the URL list of one supermarket, writes the stored URLs of its
   * host to a temporary file and scrapes that file.
   *
   * Resolves only after the scrape finished (or failed and was reported).
   *
   * @throws Error when `hosts` has no entry for the supermarket.
   */
  async downloadList(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));

    await scrapQueue.add(async () => {
      await this.scrapUrls(supermercado);
    });

    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
    // `hosts` maps hostname -> supermarket; look it up in reverse.
    const hostEntry = Object.entries(hosts).find(
      ([, supe]) => supe === supermercado
    );
    if (!hostEntry)
      throw this.report(`No encontré el host de ${supermercado}`);
    const [host] = hostEntry;

    const results = await db.query.productoUrls
      .findMany({
        where: like(productoUrls.url, `%${host}%`),
      })
      .execute();
    const urls = results.map((r) => r.url);
    await writeFile(listPath, urls.join("\n") + "\n");

    // Awaited so the caller (and `auto()`) only resolves when the scrape is
    // really over, and so no failure is left as an unhandled rejection.
    await this.scrapAndInform({ listPath });
    // TODO: delete the temporary files
  }

  /**
   * Runs `scraper-rs fetch-list` on the given list (through the shared queue)
   * and reports the outcome. Failures are reported, never thrown.
   */
  async scrapAndInform({ listPath }: { listPath: string }) {
    try {
      const res = await scrapQueue.add(async () => {
        const t0 = performance.now();
        const sub = Bun.spawn({
          cmd: ["scraper-rs", "fetch-list", listPath],
          stdio: ["ignore", "pipe", "inherit"],
        });
        const text = await readableStreamToText(sub.stdout);
        const code = await sub.exited;
        if (code !== 0) throw new Error(`scraper-rs threw ${code}`);
        return { took: performance.now() - t0, text };
      });

      if (res) {
        const { took, text } = res;
        this.inform(
          `Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`
          //(${progress.done} ok, ${
          //  progress.skipped
          // } skipped, ${progress.errors.length} errores)
        );
      } else {
        this.inform(`Algo falló en ${listPath}`);
      }
    } catch (error) {
      // A non-zero exit code (or a spawn failure) rejects the queue task; turn
      // it into the same failure report instead of letting it escape.
      console.error(error);
      this.inform(`Algo falló en ${listPath}`);
    }
  }

  /** Logs a message to the console and, best effort, to Telegram. */
  inform(msg: string) {
    // Fire-and-forget on purpose, but a failed send must not become an
    // unhandled rejection.
    this.sendTelegramMsg(msg).catch((error) =>
      console.error("[auto] no pude mandar el mensaje a Telegram", error)
    );
    console.info(msg);
  }

  /** Informs the message and returns it wrapped in an Error for the caller to throw. */
  report(msg: string) {
    this.inform(msg);
    const error = new Error(msg);
    return error;
  }

  /** Sends a message through the Telegram Bot API; does nothing when not configured. */
  async sendTelegramMsg(text: string) {
    if (!this.telegramConfig) return;
    const url = new URL(
      `https://api.telegram.org/bot${this.telegramConfig.token}/sendMessage`
    );
    url.searchParams.set("chat_id", this.telegramConfig.chatId);
    url.searchParams.set("text", text);
    await fetch(url);
  }
}
/**
 * Formats a duration given in milliseconds as human-readable text.
 *
 * @param ms Duration in milliseconds; rounded to the nearest integer first.
 */
function formatMs(ms: number) {
  const duration = intervalToDuration({ start: 0, end: Math.round(ms) });
  return formatDuration(duration);
}

44
scraper/cli.ts Normal file
View file

@ -0,0 +1,44 @@
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
import { scrapCotoProducts } from "../link-scrapers/coto.js";
import { scrapDiaProducts } from "../link-scrapers/dia.js";
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
import { auto } from "./auto.js";
import { downloadList, getProduct } from "./scrap.js";
import Cron from "croner";
// Command-line entry point: the first argument selects the action, the
// remaining ones are that action's parameters.
const [action, ...args] = process.argv.slice(2);

switch (action) {
  case "auto":
    await auto();
    break;

  case "cron":
    // Runs the automatic pipeline every day at 02:00.
    Cron("0 2 * * *", () => {
      // A failed run must not take the scheduler down with an unhandled rejection.
      auto().catch((error) => console.error("[cron] auto falló", error));
    });
    break;

  case "scrap-carrefour-links":
    await scrapCarrefourProducts();
    break;

  case "scrap-dia-links":
    await scrapDiaProducts();
    break;

  case "scrap-coto-links":
    await scrapCotoProducts();
    break;

  case "scrap-jumbo-links":
    await scrapJumboProducts();
    break;

  case "scrap-link": {
    // Downloads a single product page and prints what the parser extracts.
    const rawUrl = args[0];
    if (!rawUrl) {
      console.error("Especificá una URL para scrapear.");
      process.exit(1);
    }
    const url = new URL(rawUrl);
    const res = await fetch(url);
    const text = await res.text();
    console.info(await getProduct(url, text));
    break;
  }

  case "scrap": {
    // Scrapes every URL list given as argument, one list after the other.
    const urlLists = args;
    if (urlLists.length > 0) {
      for (const path of urlLists) {
        const res = await downloadList(path);
        console.info("=======================================");
        console.info(path, res);
        console.info("=======================================");
      }
    } else {
      console.error("Especificá listas de urls para scrapear.");
      process.exit(1);
    }
    break;
  }

  default:
    console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
    process.exit(1);
}

29
scraper/package.json Normal file
View file

@ -0,0 +1,29 @@
{
"name": "scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"check": "tsc"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"@aws-sdk/client-s3": "^3.478.0",
"@aws-sdk/lib-storage": "^3.478.0",
"croner": "^8.0.0",
"date-fns": "^3.0.6",
"db-datos": "workspace:^",
"drizzle-orm": "^0.29.1",
"linkedom": "^0.16.5",
"nanoid": "^5.0.4",
"p-map": "^7.0.1",
"p-queue": "^8.0.1",
"zod": "^3.22.4"
},
"devDependencies": {
"typescript": "^5.3.3"
}
}

View file

@ -0,0 +1,56 @@
import { parseHTML } from "linkedom";
import { Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
/**
 * Reads the JSON embedded in a `<template data-type="json">` element.
 *
 * @param dom Parsed page.
 * @param varname Value of the template's `data-varname` attribute.
 * @returns The parsed JSON, typed as `T` without validation.
 * @throws Error when the template or its first child is missing.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  // The JSON text lives in the first child of the template's content.
  const script = template?.content?.children[0];
  if (!script) throw new Error("no encuentro el script");
  return JSON.parse(script.innerHTML);
}
/**
 * Extracts the product's EAN from the `__STATE__` JSON embedded in the page.
 *
 * It first locates the `Product` entry and then the `SKU` entry whose key
 * starts with `Product:<cacheId of that product>`.
 *
 * @throws Error when the product or its SKU entry cannot be found.
 */
function eanFromSeedState(dom: Window): string {
  const entries = Object.entries(parseScriptJson<object>(dom, "__STATE__"));

  const product = entries.find(
    ([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
  );
  if (!product) throw new Error("no encontré el product en el json");

  const skuPrefix = `Product:${product[1].cacheId}`;
  const sku = entries.find(
    ([key, val]) => key.startsWith(skuPrefix) && val.__typename === "SKU"
  );
  if (!sku) throw new Error("no encontré el sku en el json");

  return sku[1].ean;
}
/**
 * Parses a Carrefour product page.
 *
 * Price and stock come from the page's meta tags, the EAN from the embedded
 * state JSON and name/image from the JSON-LD block.
 *
 * @throws Whatever the JSON-LD lookup throws, but only for in-stock products;
 *   other lookups always propagate their errors.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const ean = eanFromSeedState(dom);

  let ld: ReturnType<typeof getProductJsonLd> | undefined;
  try {
    ld = getProductJsonLd(dom);
  } catch (error) {
    // Some out-of-stock pages have no JSON-LD, so that is only an error
    // when the product is in stock.
    if (inStock) throw error;
  }

  return {
    name: ld?.name,
    imageUrl: ld?.image,
    ean,
    precioCentavos,
    inStock,
  };
}

55
scraper/parsers/common.ts Normal file
View file

@ -0,0 +1,55 @@
import { z } from "zod";
/**
 * Returns the `content` attribute of the `<meta property="...">` tag with the
 * given property name.
 *
 * @returns The attribute value, `null` when the tag has no `content`, or
 *   `undefined` when there is no such tag.
 */
export function getMetaProp(dom: Window, prop: string) {
  const meta = dom.window.document.querySelector(`meta[property="${prop}"]`);
  return meta?.getAttribute("content");
}
/**
 * Reads the product price from the `product:price:amount` meta tag.
 *
 * @returns The price in cents as an integer, or `null` when the tag is missing,
 *   empty or not a number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  const precio = parseFloat(precioMeta);
  if (Number.isNaN(precio)) return null;
  // Rounded because binary floating point makes e.g. 19.99 * 100 come out as
  // 1998.9999999999998 instead of 1999.
  return Math.round(precio * 100);
}
/**
 * Tells whether the `product:availability` meta tag says the product is in
 * stock (its content is exactly `"instock"`).
 */
export function stockFromMeta(dom: Window) {
  return getMetaProp(dom, "product:availability") === "instock";
}
/**
 * Parses every `<script type="application/ld+json">` block of the page.
 *
 * @returns One parsed value per script, in document order.
 * @throws SyntaxError when a block does not contain valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const scripts = Array.from(
    dom.window.document.querySelectorAll('script[type="application/ld+json"]')
  );
  return scripts.map((script) => JSON.parse(script.innerHTML));
}
/**
 * Finds the first JSON-LD block of the page whose `@type` equals `type`.
 *
 * @returns The parsed block, or `undefined` when none matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const ld of parseJsonLds(dom)) {
    if ("@type" in ld && ld["@type"] === type) return ld;
  }
  return undefined;
}
// Shape expected from a page's "Product" JSON-LD block. Only the fields listed
// here are validated; `sku` is the only optional one.
const zProductLd = z.object({
"@type": z.literal("Product"),
name: z.string(),
image: z.string(),
sku: z.string().optional(),
// The individual offers are nested one level down, under `offers.offers`.
offers: z.object({
offers: z.array(
z.object({
"@type": z.literal("Offer"),
price: z.number(),
priceCurrency: z.literal("ARS"),
// Only these two availability values are accepted; anything else fails validation.
availability: z.enum([
"http://schema.org/OutOfStock",
"http://schema.org/InStock",
]),
})
),
}),
});
// Static type of a validated product JSON-LD block.
type ProductLd = z.infer<typeof zProductLd>;
/**
 * Returns the page's "Product" JSON-LD block, validated against `zProductLd`.
 *
 * @throws A zod validation error when the block is missing or has another shape.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}

48
scraper/parsers/coto.ts Normal file
View file

@ -0,0 +1,48 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
/**
 * Extracts the EAN from a Coto product page.
 *
 * It looks for the first `div#brandText` whose text contains "| EAN: " and
 * returns the trimmed text of the second `span.span_codigoplu` inside it.
 *
 * @throws Error when that container or a non-empty EAN cannot be found.
 */
function getEanFromText({ document }: Window) {
  let eanParent: Element | undefined;
  for (const el of Array.from(document.querySelectorAll("div#brandText"))) {
    if (el.textContent?.includes("| EAN: ")) {
      eanParent = el;
      break;
    }
  }
  if (!eanParent) throw new Error("no encuentro el eanparent");

  const spans = Array.from(eanParent.querySelectorAll("span.span_codigoplu"));
  // The second span is the one that holds the EAN.
  const ean = spans[1]?.textContent?.trim();
  if (!ean) throw new Error("no encuentro el ean");
  return ean;
}
/**
 * Reads the price shown in the `.atg_store_newPrice` element of a Coto page.
 *
 * The text uses "." as thousands separator and "," as decimal separator
 * (e.g. "$1.234,50").
 *
 * @returns The price in cents as an integer, or `null` when the element is
 *   missing, empty or does not contain a number.
 */
function getPriceFromText({ document }: Window) {
  const el = document.querySelector(".atg_store_newPrice");
  if (!el?.textContent) return null;
  const nStr = el.textContent
    .trim()
    .replace("$", "")
    .replace(/\./g, "")
    .replace(",", ".");
  const precio = parseFloat(nStr);
  if (Number.isNaN(precio)) return null;
  // Rounded because binary floating point makes e.g. 19.99 * 100 come out as
  // 1998.9999999999998 instead of 1999.
  return Math.round(precio * 100);
}
/**
 * Tells whether a Coto product is in stock: it is, unless the page contains a
 * `.product_not_available` element.
 */
function getInStock({ document }: Window) {
  const notAvailable = document.querySelector(".product_not_available");
  return notAvailable ? false : true;
}
/**
 * Parses a Coto product page into EAN, price, stock, name and image URL.
 *
 * @throws Error when the EAN cannot be found; the name and image may be
 *   undefined when their elements are missing.
 */
export function getCotoProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const { document } = dom;

  const ean = getEanFromText(dom);
  const precioCentavos = getPriceFromText(dom);
  const inStock = getInStock(dom);

  const titleEl = document.querySelector("h1.product_page");
  const imageEl = document.querySelector<HTMLImageElement>(".zoom img");

  return {
    name: titleEl?.textContent?.trim(),
    imageUrl: imageEl?.src,
    ean,
    precioCentavos,
    inStock,
  };
}

25
scraper/parsers/dia.ts Normal file
View file

@ -0,0 +1,25 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
/**
 * Parses a Día product page.
 *
 * The EAN comes from the `product:retailer_item_id` meta tag, the price from
 * the price meta tag, and name, image and stock from the JSON-LD block (stock
 * is taken from its first offer).
 *
 * @throws Error when the EAN meta tag is missing, or whatever the JSON-LD
 *   lookup throws.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) throw new Error("No encontré el ean");
  const precioCentavos = priceFromMeta(dom);

  const { name, image: imageUrl, offers } = getProductJsonLd(dom);
  const inStock = offers.offers[0].availability === "http://schema.org/InStock";

  return { name, imageUrl, ean, precioCentavos, inStock };
}

54
scraper/parsers/jumbo.ts Normal file
View file

@ -0,0 +1,54 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
import { z } from "zod";
// Shape expected from Jumbo's product search response: a one-element array
// whose single entry lists its items, each carrying an `ean` string.
const zJumboSearch = z.tuple([
z.object({
items: z.array(
z.object({
ean: z.string(),
})
),
}),
]);
/**
 * Looks up the EAN of a Jumbo SKU through the site's product search API.
 *
 * @param sku Retailer SKU id, sent as the `skuId` filter of the search.
 * @returns The EAN shared by every item of the search result.
 * @throws Error when the request fails, the response has an unexpected shape,
 *   there are no items, or the items disagree on the EAN.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);
  const res = await fetch(url);
  if (!res.ok)
    throw new Error(`La búsqueda de Jumbo devolvió status=${res.status}`);
  const json = await res.json();
  const parsed = zJumboSearch.parse(json);

  const [first, ...rest] = parsed[0].items;
  // An empty item list used to surface as an opaque TypeError.
  if (!first)
    throw new Error("Inesperado: la búsqueda de Jumbo no devolvió items");
  if (!rest.every((x) => x.ean === first.ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return first.ean;
}
/**
 * Parses a Jumbo product page.
 *
 * Price and stock come from the meta tags and name/image from the JSON-LD
 * block. The page is not read for the EAN: it is requested from the search
 * API using the SKU found in the JSON-LD.
 *
 * @throws Error when the JSON-LD has no SKU, or whatever the lookups throw.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const dom = parseHTML(html);
  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);

  const { name, image: imageUrl, sku: retailerSku } = getProductJsonLd(dom);
  if (!retailerSku)
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  const ean = await getEanFromSearch(retailerSku);

  return { name, imageUrl, ean, precioCentavos, inStock };
}

127
scraper/scrap.ts Normal file
View file

@ -0,0 +1,127 @@
/// <reference lib="dom" />
import * as schema from "db-datos/schema.js";
import { writeFile, mkdir } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path";
import { db } from "db-datos/db.js";
import pMap from "p-map";
import { getJumboProduct } from "./parsers/jumbo.js";
// When true, scrap() dumps the HTML of every page it fails to process into the
// `debug` directory.
const DEBUG = true;
// Stored with every inserted price row as `parserVersion`.
const PARSER_VERSION = 4;
// A row as accepted by an insert into the `precios` table.
export type Precio = typeof schema.precios.$inferInsert;
// The part of a `Precio` that the site parsers produce; the omitted fields are
// not set by them (scrap() fills in `fetchedAt`, `url` and `parserVersion`).
export type Precioish = Omit<
Precio,
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
>;
/**
 * Scrapes every URL listed (one per line) in the file at `path`.
 *
 * Up to 32 URLs are processed at once. A URL whose scrape ends in an error is
 * retried, waiting 1.5 s between tries, for at most 6 tries in total.
 *
 * @returns How many URLs were done and skipped, plus the errors that remained.
 */
export async function downloadList(path: string) {
  const fileText = await Bun.file(path).text();
  const list = fileText.split("\n").filter((line) => line.length > 0);

  const results = await pMap(
    list,
    async (urlS) => {
      let res: ScrapResult = { type: "skipped" };
      let attempts = 0;
      while (attempts < 6) {
        if (attempts > 0) await wait(1500);
        res = await scrap(urlS);
        // Only errors are worth retrying.
        if (res.type !== "error") break;
        attempts++;
      }
      if (res.type === "error") console.error(res);
      return res;
    },
    { concurrency: 32 }
  );

  const progress: {
    done: number;
    skipped: number;
    errors: { error: any; url: string; debugPath: string }[];
  } = { done: 0, skipped: 0, errors: [] };
  for (const result of results) {
    if (result.type === "done") progress.done++;
    else if (result.type === "skipped") progress.skipped++;
    else progress.errors.push(result);
  }
  return progress;
}
/**
 * Parses a product page with the parser that matches the URL's hostname.
 *
 * @param url URL the page was downloaded from; only its hostname is used.
 * @param html The page's HTML.
 * @throws Error for hostnames without a parser, or whatever the parser throws.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
}
// Outcome of scraping one URL:
// - "skipped": the URL was invalid or the server answered with a non-OK status
// - "done": the price was parsed and inserted
// - "error": parsing or inserting failed; `debugPath` is where the page's HTML
//   is dumped when DEBUG is on
type ScrapResult =
| { type: "skipped" }
| { type: "done" }
| { type: "error"; url: string; error: any; debugPath: string };
/**
 * Downloads one product page, parses it and inserts the resulting price row.
 *
 * Never rejects for download, parse or insert problems: those are returned as
 * an "error" result so the caller can retry that URL without losing the rest
 * of the list.
 *
 * @param urlS URL of the product page.
 * @returns "skipped" for invalid URLs and non-OK responses, "done" after a
 *   successful insert, "error" otherwise. For download failures `debugPath`
 *   is an empty string, because there is no HTML to dump.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url: URL;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  let html: string;
  try {
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();
  } catch (error) {
    // A network failure used to reject here, and a single rejection makes the
    // caller's pMap abort the whole list. Report it like any other error.
    return { type: "error", url: urlS, error, debugPath: "" };
  }

  try {
    const ish = await getProduct(url, html);

    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };

    await db.insert(schema.precios).values(p);

    return { type: "done" };
  } catch (error) {
    // The dump is named after the MD5 of the URL so it is a valid, stable file name.
    const urlHash = createHash("md5").update(urlS).digest("hex");
    const output = join("debug", `${urlHash}.html`);
    if (DEBUG) {
      await mkdir("debug", { recursive: true });
      await writeFile(output, html);
    }
    return {
      type: "error",
      url: urlS,
      error,
      debugPath: output,
    };
  }
}
/**
 * Returns a promise that resolves after `ms` milliseconds.
 */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

4
scraper/tsconfig.json Normal file
View file

@ -0,0 +1,4 @@
{
"extends": "../tsconfig.json",
"exclude": ["../sitio"]
}

View file

@ -11,7 +11,7 @@ export const load: PageServerLoad = async ({ url }) => {
join precios p on p.ean = f.ean join precios p on p.ean = f.ean
where f.name match ${`"${query}"`} where f.name match ${`"${query}"`}
group by p.ean group by p.ean
having max(p.fetched_at) having max(p.fetched_at) and max(p.in_stock)
order by p.in_stock desc;`; order by p.in_stock desc;`;
results = db.all(sqlQuery); results = db.all(sqlQuery);
} }