mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
Compare commits
No commits in common. "1439c3dd1da6cf4731556b84abf100b7b3ebc64f" and "3396a433e592faa22307105eefc47fee6f539b87" have entirely different histories.
1439c3dd1d
...
3396a433e5
22 changed files with 2132 additions and 26 deletions
22
.github/workflows/container.yml
vendored
22
.github/workflows/container.yml
vendored
|
@ -88,32 +88,12 @@ jobs:
|
||||||
uses: actions/cache@v3
|
uses: actions/cache@v3
|
||||||
with:
|
with:
|
||||||
path: usr/src/app/target
|
path: usr/src/app/target
|
||||||
key: usr/src/app/target-${{ hashFiles('Dockerfile.scraper') }}
|
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
|
||||||
- name: inject usr/src/app/target into docker
|
- name: inject usr/src/app/target into docker
|
||||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||||
with:
|
with:
|
||||||
cache-source: usr/src/app/target
|
cache-source: usr/src/app/target
|
||||||
cache-target: /usr/src/app/target
|
cache-target: /usr/src/app/target
|
||||||
- name: Cache root/.cargo/registry
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: root/.cargo/registry
|
|
||||||
key: root/.cargo/registry-${{ hashFiles('Dockerfile.scraper') }}
|
|
||||||
- name: inject root/.cargo/registry into docker
|
|
||||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
|
||||||
with:
|
|
||||||
cache-source: root/.cargo/registry
|
|
||||||
cache-target: /root/.cargo/registry
|
|
||||||
- name: Cache root/.cargo/git
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: root/.cargo/git
|
|
||||||
key: root/.cargo/git-${{ hashFiles('Dockerfile.scraper') }}
|
|
||||||
- name: inject root/.cargo/git into docker
|
|
||||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
|
||||||
with:
|
|
||||||
cache-source: root/.cargo/git
|
|
||||||
cache-target: /root/.cargo/git
|
|
||||||
- name: Build and push Docker image
|
- name: Build and push Docker image
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
export const DB_PATH = process.env.DB_PATH ?? "../sqlite.db";
|
export const DB_PATH = process.env.DB_PATH ?? "../scraper/sqlite.db";
|
||||||
|
|
||||||
/** @type { import("drizzle-kit").Config } */
|
/** @type { import("drizzle-kit").Config } */
|
||||||
export default {
|
export default {
|
||||||
|
|
33
link-scrapers/carrefour.ts
Normal file
33
link-scrapers/carrefour.ts
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import pMap from "p-map";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Entry point for collecting Carrefour product links.
 * Currently the only strategy is the sitemap-based one.
 */
export async function scrapCarrefourProducts(): Promise<void> {
  const sitemapRun = scrapBySitemap();
  await sitemapRun;
}
|
||||||
|
|
||||||
|
/**
 * Fetches every Carrefour product sitemap listed below, extracts the URLs it
 * contains with `getUrlsFromSitemap` and hands them to `saveUrls`.
 * At most 3 sitemaps are downloaded concurrently.
 */
async function scrapBySitemap() {
  // taken from https://www.carrefour.com.ar/sitemap.xml
  const sitemapUrls = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];

  // Download one sitemap and store the links found in it.
  const processSitemap = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    saveUrls(getUrlsFromSitemap(body));
  };

  await pMap(sitemapUrls, processSitemap, { concurrency: 3 });
}
|
14
link-scrapers/common.ts
Normal file
14
link-scrapers/common.ts
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import { decodeXML } from "entities";
|
||||||
|
/**
 * Extracts the de-duplicated list of URLs found in the `<loc>` elements of a
 * sitemap document.
 *
 * @param xml - Raw sitemap XML.
 * @returns The distinct, XML-entity-decoded `<loc>` values in document order.
 */
export function getUrlsFromSitemap(xml: string) {
  const urls = new Set<string>();
  // HTMLRewriter may deliver a single text node as several chunks, so the
  // pieces are buffered until the chunk flagged as the last one arrives.
  let pending = "";
  new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        pending += chunk.text;
        if (!chunk.lastInTextNode) return;
        const txt = pending.trim();
        pending = "";
        if (!txt) return;
        // Text chunks are raw markup, so entities such as `&amp;` are decoded here.
        urls.add(decodeXML(txt));
      },
    })
    .transform(new Response(xml));
  return Array.from(urls);
}
|
44
link-scrapers/coto.ts
Normal file
44
link-scrapers/coto.ts
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import PQueue from "p-queue";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
|
||||||
|
/**
 * Collects Coto product links by walking the paginated browse listing.
 * Page URLs are derived from `initial` by overriding its `No` (offset) and
 * `Nrpp` (page size) query parameters; pages are fetched through a queue
 * that runs 4 tasks at a time.
 */
export async function scrapCotoProducts() {
  const initial =
    "https://www.cotodigital3.com.ar/sitios/cdigi/browse?Nf=product.endDate%7CGTEQ+1.7032032E12%7C%7Cproduct.startDate%7CLTEQ+1.7032032E12&No=2200&Nr=AND%28product.sDisp_200%3A1004%2Cproduct.language%3Aespa%C3%B1ol%2COR%28product.siteId%3ACotoDigital%29%29&Nrpp=200";

  const queue = new PQueue({ concurrency: 4 });

  const pageSize = 300; // up to 1000
  // 29000 is a hard-coded upper bound on the number of listed products.
  const pageCount = Math.ceil(29000 / pageSize);

  const links: string[] = [];
  for (let page = 0; page < pageCount; page++) {
    const pageUrl = new URL(initial);
    pageUrl.searchParams.set("No", `${page * pageSize}`);
    pageUrl.searchParams.set("Nrpp", `${pageSize}`);
    links.push(pageUrl.toString());
  }

  const pending = links.map((link) => queue.add(getPage(link)));
  await Promise.all(pending);
}
|
||||||
|
|
||||||
|
/**
 * Builds a task that downloads one Coto listing page, extracts the product
 * links inside `.product_info_container` and hands them to `saveUrls`.
 *
 * @param url - Listing page to download.
 * @param maxAttempts - How many times the download is tried before giving up
 *   (previously it was retried forever, which could hang the queue).
 * @returns An async thunk suitable for `PQueue.add`.
 * @throws The last download error once every attempt has failed.
 */
function getPage(url: string, maxAttempts = 5) {
  return async () => {
    let html: string | undefined;
    let lastError: unknown;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        const res = await fetch(url);
        html = await res.text();
        break;
      } catch (error) {
        lastError = error;
      }
    }
    if (html === undefined) throw lastError;

    const { document } = parseHTML(html);

    // hrefs may be relative, so they are resolved against the page URL.
    const hrefs = Array.from(
      document.querySelectorAll<HTMLAnchorElement>(".product_info_container a"),
      (a) => new URL(a.href, url).toString()
    );
    saveUrls(hrefs);
  };
}
|
124
link-scrapers/dia.ts
Normal file
124
link-scrapers/dia.ts
Normal file
|
@ -0,0 +1,124 @@
|
||||||
|
import pMap from "p-map";
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Dia category / product-cluster listing pages walked by `scrapBySite`.
 */
const categorias = [
  // almacen
  "https://diaonline.supermercadosdia.com.ar/almacen",
  "https://diaonline.supermercadosdia.com.ar/almacen/conservas",
  "https://diaonline.supermercadosdia.com.ar/almacen/aceites-y-aderezos",
  "https://diaonline.supermercadosdia.com.ar/almacen/pastas-secas",
  "https://diaonline.supermercadosdia.com.ar/almacen/arroz-y-legumbres",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia",
  "https://diaonline.supermercadosdia.com.ar/almacen/golosinas-y-alfajores",
  "https://diaonline.supermercadosdia.com.ar/almacen/reposteria",
  "https://diaonline.supermercadosdia.com.ar/almacen/comidas-listas",
  "https://diaonline.supermercadosdia.com.ar/almacen/harinas",
  "https://diaonline.supermercadosdia.com.ar/almacen/picadas",
  "https://diaonline.supermercadosdia.com.ar/almacen/panaderia/pan-rallado-y-rebozadores",
  // desayuno
  "https://diaonline.supermercadosdia.com.ar/desayuno",
  "https://diaonline.supermercadosdia.com.ar/desayuno/galletitas-y-cereales",
  "https://diaonline.supermercadosdia.com.ar/desayuno/infusiones-y-endulzantes",
  "https://diaonline.supermercadosdia.com.ar/desayuno/para-untar",
  // frescos
  "https://diaonline.supermercadosdia.com.ar/frescos",
  "https://diaonline.supermercadosdia.com.ar/frescos/leches",
  "https://diaonline.supermercadosdia.com.ar/frescos/fiambreria",
  "https://diaonline.supermercadosdia.com.ar/frescos/lacteos",
  "https://diaonline.supermercadosdia.com.ar/frescos/carniceria",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/pastas-frescas",
  "https://diaonline.supermercadosdia.com.ar/frescos/listos-para-disfrutar",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutas",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/verduras",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/huevos",
  "https://diaonline.supermercadosdia.com.ar/frescos/frutas-y-verduras/frutos-secos",
  // bebidas
  "https://diaonline.supermercadosdia.com.ar/bebidas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/gaseosas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/cervezas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aguas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bodega",
  "https://diaonline.supermercadosdia.com.ar/bebidas/jugos-e-isot%C3%B3nicas",
  "https://diaonline.supermercadosdia.com.ar/bebidas/aperitivos",
  "https://diaonline.supermercadosdia.com.ar/bebidas/bebidas-blancas-y-licores",
  // congelados
  "https://diaonline.supermercadosdia.com.ar/congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/hamburguesas-y-medallones",
  "https://diaonline.supermercadosdia.com.ar/congelados/rebozados",
  "https://diaonline.supermercadosdia.com.ar/congelados/vegetales-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/postres-congelados",
  "https://diaonline.supermercadosdia.com.ar/congelados/pescaderia",
  "https://diaonline.supermercadosdia.com.ar/congelados/papas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/comidas-congeladas",
  "https://diaonline.supermercadosdia.com.ar/congelados/hielo",
  // limpieza
  "https://diaonline.supermercadosdia.com.ar/limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/cuidado-de-la-ropa",
  "https://diaonline.supermercadosdia.com.ar/limpieza/papeleria",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpiadores",
  "https://diaonline.supermercadosdia.com.ar/limpieza/limpieza-de-cocina",
  "https://diaonline.supermercadosdia.com.ar/limpieza/accesorios-de-limpieza",
  "https://diaonline.supermercadosdia.com.ar/limpieza/desodorantes-de-ambiente",
  "https://diaonline.supermercadosdia.com.ar/limpieza/insecticidas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/fosforos-y-velas",
  "https://diaonline.supermercadosdia.com.ar/limpieza/bolsas",
  // product clusters, ordered by best discount
  "https://diaonline.supermercadosdia.com.ar/4160?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4136?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4143?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4189?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/4086?map=productClusterIds&order=OrderByBestDiscountDESC",
  "https://diaonline.supermercadosdia.com.ar/2089?map=productClusterIds&order=OrderByBestDiscountDESC",
];
|
||||||
|
|
||||||
|
/**
 * Entry point for collecting Dia product links.
 * Only the sitemap strategy is enabled; the category-walking one
 * (`scrapBySite`) is kept but switched off.
 */
export async function scrapDiaProducts() {
  const strategies = [
    // scrapBySite(),
    scrapBySitemap(),
  ];
  await Promise.all(strategies);
}
|
||||||
|
|
||||||
|
/**
 * Fetches every Dia product sitemap listed below, extracts the URLs it
 * contains with `getUrlsFromSitemap` and hands them to `saveUrls`.
 * At most 3 sitemaps are downloaded concurrently.
 */
async function scrapBySitemap() {
  // taken from https://diaonline.supermercadosdia.com.ar/sitemap.xml
  const sitemapUrls = [
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-1.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-2.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-3.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-4.xml",
    "https://diaonline.supermercadosdia.com.ar/sitemap/product-5.xml",
  ];

  // Download one sitemap and store the links found in it.
  const processSitemap = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    saveUrls(getUrlsFromSitemap(body));
  };

  await pMap(sitemapUrls, processSitemap, { concurrency: 3 });
}
|
||||||
|
|
||||||
|
/**
 * Alternative Dia strategy: walks pages 0..50 of every entry in `categorias`
 * and stores the product links found in each listing page.
 * Up to 32 pages are fetched concurrently.
 */
async function scrapBySite() {
  const links: string[] = [];
  for (const categoria of categorias) {
    for (let page = 0; page <= 50; page++) {
      const pageUrl = new URL(categoria);
      pageUrl.searchParams.set("page", `${page}`);
      links.push(pageUrl.toString());
    }
  }

  const scrapePage = async (url: string) => {
    // NOTE(review): `timeout` is not a standard fetch option; presumably a
    // runtime-specific extension — confirm against the runtime in use.
    const res = await fetch(url, { timeout: false });
    const html = await res.text();
    const { document } = parseHTML(html);

    const anchors = document.querySelectorAll<HTMLAnchorElement>(
      "a.vtex-product-summary-2-x-clearLink"
    );
    // hrefs may be relative, so they are resolved against the page URL.
    const hrefs = Array.from(anchors, (a) => new URL(a.href, url).toString());
    saveUrls(hrefs);
  };

  await pMap(links, scrapePage, { concurrency: 32 });
}
|
38
link-scrapers/jumbo.ts
Normal file
38
link-scrapers/jumbo.ts
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
import pMap from "p-map";
|
||||||
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
import { getUrlsFromSitemap } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Entry point for collecting Jumbo product links.
 * Currently the only strategy is the sitemap-based one.
 */
export async function scrapJumboProducts(): Promise<void> {
  const sitemapRun = scrapBySitemap();
  await sitemapRun;
}
|
||||||
|
|
||||||
|
/**
 * Fetches every Jumbo product sitemap listed below, extracts the URLs it
 * contains with `getUrlsFromSitemap` and hands them to `saveUrls`.
 * At most 3 sitemaps are downloaded concurrently.
 */
async function scrapBySitemap() {
  // taken from https://www.jumbo.com.ar/sitemap.xml
  const sitemapUrls = [
    "https://www.jumbo.com.ar/sitemap/product-1.xml",
    "https://www.jumbo.com.ar/sitemap/product-10.xml",
    "https://www.jumbo.com.ar/sitemap/product-11.xml",
    "https://www.jumbo.com.ar/sitemap/product-12.xml",
    "https://www.jumbo.com.ar/sitemap/product-13.xml",
    "https://www.jumbo.com.ar/sitemap/product-14.xml",
    "https://www.jumbo.com.ar/sitemap/product-15.xml",
    "https://www.jumbo.com.ar/sitemap/product-2.xml",
    "https://www.jumbo.com.ar/sitemap/product-3.xml",
    "https://www.jumbo.com.ar/sitemap/product-4.xml",
    "https://www.jumbo.com.ar/sitemap/product-5.xml",
    "https://www.jumbo.com.ar/sitemap/product-6.xml",
    "https://www.jumbo.com.ar/sitemap/product-7.xml",
    "https://www.jumbo.com.ar/sitemap/product-8.xml",
    "https://www.jumbo.com.ar/sitemap/product-9.xml",
  ];

  // Download one sitemap and store the links found in it.
  const processSitemap = async (sitemapUrl: string) => {
    const response = await fetch(sitemapUrl);
    const body = await response.text();
    saveUrls(getUrlsFromSitemap(body));
  };

  await pMap(sitemapUrls, processSitemap, { concurrency: 3 });
}
|
18
link-scrapers/package.json
Normal file
18
link-scrapers/package.json
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
{
|
||||||
|
"name": "link-scrapers",
|
||||||
|
"type": "module",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"entities": "^4.5.0",
|
||||||
|
"linkedom": "^0.16.5",
|
||||||
|
"p-queue": "^8.0.1"
|
||||||
|
}
|
||||||
|
}
|
1278
pnpm-lock.yaml
1278
pnpm-lock.yaml
File diff suppressed because it is too large
Load diff
|
@ -1,3 +1,5 @@
|
||||||
packages:
|
packages:
|
||||||
|
- link-scrapers
|
||||||
|
- scraper
|
||||||
- sitio
|
- sitio
|
||||||
- db-datos
|
- db-datos
|
||||||
|
|
|
@ -111,7 +111,7 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn connect_db() -> Pool {
|
fn connect_db() -> Pool {
|
||||||
let db_path = env::var("DB_PATH").unwrap_or("../sqlite.db".to_string());
|
let db_path = env::var("DB_PATH").unwrap_or("../scraper/sqlite.db".to_string());
|
||||||
let cfg = deadpool_sqlite::Config::new(db_path);
|
let cfg = deadpool_sqlite::Config::new(db_path);
|
||||||
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
|
let pool = cfg.create_pool(deadpool_sqlite::Runtime::Tokio1).unwrap();
|
||||||
pool
|
pool
|
||||||
|
|
137
scraper/auto.ts
Normal file
137
scraper/auto.ts
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
import { mkdtemp, writeFile } from "node:fs/promises";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { Supermercado, hosts, supermercados } from "db-datos/supermercado.js";
|
||||||
|
import PQueue from "p-queue";
|
||||||
|
import { formatDuration, intervalToDuration } from "date-fns";
|
||||||
|
import { db } from "db-datos/db.js";
|
||||||
|
import { like } from "drizzle-orm";
|
||||||
|
import { productoUrls } from "db-datos/schema.js";
|
||||||
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
|
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||||
|
import { readableStreamToText } from "bun";
|
||||||
|
|
||||||
|
// scraping goes through a queue so that there is a single writer to the DB and the CPU is not overloaded
|
||||||
|
const scrapQueue = new PQueue({ concurrency: 1 });
|
||||||
|
|
||||||
|
/**
 * Runs the automatic pipeline: one `Auto` instance handles every known
 * supermarket, and the function resolves when all `downloadList` calls have
 * settled successfully.
 */
export async function auto() {
  const runner = new Auto();
  const downloads = supermercados.map((supermercado) =>
    runner.downloadList(supermercado)
  );
  await Promise.all(downloads);
}
|
||||||
|
|
||||||
|
/**
 * Orchestrates a full scrape run: collects product links per supermarket,
 * writes them to a temporary list file, runs `scraper-rs fetch-list` on it and
 * reports progress to the console and, when configured, to a Telegram chat.
 */
class Auto {
  /** Set only when both TELEGRAM_BOT_TOKEN and TELEGRAM_BOT_CHAT_ID are present. */
  telegramConfig?: { token: string; chatId: string };

  constructor() {
    if (!process.env.TELEGRAM_BOT_TOKEN)
      console.warn("no hay TELEGRAM_BOT_TOKEN, no voy a loggear por allá");
    else if (!process.env.TELEGRAM_BOT_CHAT_ID)
      console.warn("no hay TELEGRAM_BOT_CHAT_ID, no voy a loggear por allá");
    else
      this.telegramConfig = {
        token: process.env.TELEGRAM_BOT_TOKEN,
        chatId: process.env.TELEGRAM_BOT_CHAT_ID,
      };

    this.inform("[auto] Empezando scrap");
  }

  /**
   * Runs the link scraper that matches `supermercado` and reports how long it took.
   */
  async scrapUrls(supermercado: Supermercado) {
    const t0 = performance.now();
    switch (supermercado) {
      case "Dia":
        await scrapDiaProducts();
        break;
      case "Coto":
        await scrapCotoProducts();
        break;
      case "Carrefour":
        await scrapCarrefourProducts();
        break;
      case "Jumbo":
        await scrapJumboProducts();
        break;
    }
    this.inform(
      `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
    );
  }

  /**
   * Refreshes the stored links of one supermarket, dumps every stored URL that
   * contains its host into a temporary list file and scrapes that list.
   * Resolves once the list has been processed (or its failure reported).
   *
   * @throws If no host is configured for `supermercado`.
   */
  async downloadList(supermercado: Supermercado) {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-download-"));

    await scrapQueue.add(async () => {
      await this.scrapUrls(supermercado);
    });

    const listPath = join(ctxPath, `lista-${supermercado}.txt`);
    const hostEntry = Object.entries(hosts).find(
      ([, supe]) => supe === supermercado
    );
    if (!hostEntry)
      throw this.report(`No hay host configurado para ${supermercado}`);
    const host = hostEntry[0];

    const results = await db.query.productoUrls
      .findMany({
        where: like(productoUrls.url, `%${host}%`),
      })
      .execute();
    const urls = results.map((r) => r.url);
    await writeFile(listPath, urls.join("\n") + "\n");

    // Awaited so that callers (and `auto()`) only resolve once scraping is done.
    await this.scrapAndInform({ listPath });
    // TODO: delete temporary files
  }

  /**
   * Runs `scraper-rs fetch-list <listPath>` through the shared queue and
   * reports the outcome. Failures are reported instead of rejecting, so one
   * failing list does not abort the lists of the other supermarkets.
   */
  async scrapAndInform({ listPath }: { listPath: string }) {
    let res: { took: number; text: string } | void;
    try {
      res = await scrapQueue.add(async () => {
        const t0 = performance.now();

        const sub = Bun.spawn({
          cmd: ["scraper-rs", "fetch-list", listPath],
          stdio: ["ignore", "pipe", "inherit"],
        });
        const text = await readableStreamToText(sub.stdout);
        const code = await sub.exited;
        if (code !== 0) throw new Error(`scraper-rs threw ${code}`);

        return { took: performance.now() - t0, text };
      });
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.inform(`Algo falló en ${listPath}: ${reason}`);
      return;
    }

    if (res) {
      const { took, text } = res;
      this.inform(`Procesado ${listPath} (${text}) (tardó ${formatMs(took)})`);
    } else {
      this.inform(`Algo falló en ${listPath}`);
    }
  }

  /** Logs `msg` to the console and, best-effort, to Telegram. */
  inform(msg: string) {
    // Fire-and-forget: a Telegram outage must never break the scrape.
    // The error itself is not logged because it could include the bot URL.
    this.sendTelegramMsg(msg).catch(() => {
      console.error("[telegram] no se pudo enviar el mensaje");
    });
    console.info(msg);
  }

  /** Informs `msg` and returns an `Error` carrying it, ready to be thrown. */
  report(msg: string) {
    this.inform(msg);
    const error = new Error(msg);

    return error;
  }

  /** Sends `text` to the configured Telegram chat; no-op when not configured. */
  async sendTelegramMsg(text: string) {
    if (!this.telegramConfig) return;
    const url = new URL(
      `https://api.telegram.org/bot${this.telegramConfig.token}/sendMessage`
    );
    url.searchParams.set("chat_id", this.telegramConfig.chatId);
    url.searchParams.set("text", text);
    await fetch(url);
  }
}
|
||||||
|
|
||||||
|
/**
 * Renders a millisecond count as a human-readable duration using date-fns.
 * The value is rounded to whole milliseconds and measured from the epoch.
 */
function formatMs(ms: number) {
  const end = Math.round(ms);
  const duration = intervalToDuration({ start: 0, end });
  return formatDuration(duration);
}
|
44
scraper/cli.ts
Normal file
44
scraper/cli.ts
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
import { scrapCarrefourProducts } from "../link-scrapers/carrefour.js";
|
||||||
|
import { scrapCotoProducts } from "../link-scrapers/coto.js";
|
||||||
|
import { scrapDiaProducts } from "../link-scrapers/dia.js";
|
||||||
|
import { scrapJumboProducts } from "../link-scrapers/jumbo.js";
|
||||||
|
import { auto } from "./auto.js";
|
||||||
|
import { downloadList, getProduct } from "./scrap.js";
|
||||||
|
import Cron from "croner";
|
||||||
|
|
||||||
|
// Command-line dispatcher: the first argument selects the action to run.
const [, , action, ...args] = process.argv;

switch (action) {
  case "auto":
    await auto();
    break;

  case "cron":
    // Every day at 02:00. A failed run is logged instead of being left as an
    // unhandled rejection, so the scheduler keeps running.
    Cron("0 2 * * *", () => {
      auto().catch((error) => {
        console.error("[cron] auto() falló:", error);
      });
    });
    break;

  case "scrap-carrefour-links":
    await scrapCarrefourProducts();
    break;

  case "scrap-dia-links":
    await scrapDiaProducts();
    break;

  case "scrap-coto-links":
    await scrapCotoProducts();
    break;

  case "scrap-jumbo-links":
    await scrapJumboProducts();
    break;

  case "scrap-link": {
    const rawUrl = args[0];
    if (!rawUrl) {
      console.error("Especificá una url para scrapear.");
      process.exit(1);
    }
    const url = new URL(rawUrl);
    const res = await fetch(url);
    const text = await res.text();
    console.info(await getProduct(url, text));
    break;
  }

  case "scrap": {
    const urlLists = args;
    if (urlLists.length === 0) {
      console.error("Especificá listas de urls para scrapear.");
      process.exit(1);
    }
    for (const path of urlLists) {
      const res = await downloadList(path);
      console.info("=======================================");
      console.info(path, res);
      console.info("=======================================");
    }
    break;
  }

  default:
    console.error("Especificá una acción (tipo `auto` o `scrap`) para hacer.");
    process.exit(1);
}
|
29
scraper/package.json
Normal file
29
scraper/package.json
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
{
|
||||||
|
"name": "scraper",
|
||||||
|
"type": "module",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"check": "tsc"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"@aws-sdk/client-s3": "^3.478.0",
|
||||||
|
"@aws-sdk/lib-storage": "^3.478.0",
|
||||||
|
"croner": "^8.0.0",
|
||||||
|
"date-fns": "^3.0.6",
|
||||||
|
"db-datos": "workspace:^",
|
||||||
|
"drizzle-orm": "^0.29.1",
|
||||||
|
"linkedom": "^0.16.5",
|
||||||
|
"nanoid": "^5.0.4",
|
||||||
|
"p-map": "^7.0.1",
|
||||||
|
"p-queue": "^8.0.1",
|
||||||
|
"zod": "^3.22.4"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"typescript": "^5.3.3"
|
||||||
|
}
|
||||||
|
}
|
56
scraper/parsers/carrefour.ts
Normal file
56
scraper/parsers/carrefour.ts
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { Precioish } from "../scrap.js";
|
||||||
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Reads the JSON embedded in the first child of the
 * `<template data-type="json" data-varname="…">` element named `varname`.
 *
 * @throws If no such template (or no child inside it) exists.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const script = template?.content?.children[0];
  if (!script) throw new Error("no encuentro el script");
  return JSON.parse(script.innerHTML);
}
|
||||||
|
/**
 * Finds the product EAN inside the page's `__STATE__` JSON: locates the
 * `Product:` entry typed "Product", then the entry under that product's
 * `cacheId` typed "SKU", and returns its `ean`.
 *
 * @throws If either entry is missing.
 */
function eanFromSeedState(dom: Window): string {
  const state = parseScriptJson<object>(dom, "__STATE__");
  const entries = Object.entries(state);

  const product = entries.find(
    ([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
  );
  if (!product) throw new Error("no encontré el product en el json");

  const skuPrefix = `Product:${product[1].cacheId}`;
  const sku = entries.find(
    ([key, val]) => key.startsWith(skuPrefix) && val.__typename === "SKU"
  );
  if (!sku) throw new Error("no encontré el sku en el json");
  return sku[1].ean;
}
|
||||||
|
|
||||||
|
/**
 * Parses a Carrefour product page into a `Precioish`.
 * Price and stock come from meta tags, the EAN from the `__STATE__` JSON, and
 * name/image from the Product JSON-LD. A missing or invalid JSON-LD is only
 * tolerated for out-of-stock pages.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const precioCentavos = priceFromMeta(dom);
  const inStock = stockFromMeta(dom);
  const ean = eanFromSeedState(dom);

  let name: string | undefined;
  let imageUrl: string | undefined;
  try {
    ({ name, image: imageUrl } = getProductJsonLd(dom));
  } catch (error) {
    // some out-of-stock pages have no JSON-LD, so only fail when in stock
    if (inStock) throw error;
  }

  return { name, imageUrl, ean, precioCentavos, inStock };
}
|
55
scraper/parsers/common.ts
Normal file
55
scraper/parsers/common.ts
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
/**
 * Returns the `content` attribute of the `<meta property="…">` tag named
 * `prop`, or `undefined` when there is no such tag.
 */
export function getMetaProp(dom: Window, prop: string) {
  const selector = `meta[property="${prop}"]`;
  const meta = dom.window.document.querySelector(selector);
  return meta?.getAttribute("content");
}
|
||||||
|
|
||||||
|
/**
 * Reads the price from the `product:price:amount` meta tag.
 *
 * @returns The price in whole centavos, or `null` when the tag is missing or
 *   its content is not a number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;
  // Rounded because float multiplication is inexact (1.15 * 100 = 114.99999999999999).
  const precioCentavos = Math.round(parseFloat(precioMeta) * 100);
  return Number.isFinite(precioCentavos) ? precioCentavos : null;
}
|
||||||
|
/**
 * Tells whether the `product:availability` meta tag reads exactly "instock".
 */
export function stockFromMeta(dom: Window) {
  const availability = getMetaProp(dom, "product:availability");
  const isInStock = availability === "instock";
  return isInStock;
}
|
||||||
|
|
||||||
|
/**
 * Parses the body of every `<script type="application/ld+json">` in the page.
 *
 * @throws If any of the scripts does not contain valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const selector = 'script[type="application/ld+json"]';
  const ldScripts = Array.from(dom.window.document.querySelectorAll(selector));
  return ldScripts.map((ldScript) => JSON.parse(ldScript.innerHTML));
}
|
||||||
|
/**
 * Returns the first JSON-LD document in the page whose `@type` equals `type`,
 * or `undefined` when there is none.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  const documents = parseJsonLds(dom);
  const hasType = (ld: object) => "@type" in ld && ld["@type"] === type;
  return documents.find((ld) => hasType(ld));
}
|
||||||
|
|
||||||
|
/** Shape of a single offer inside a Product JSON-LD document. */
const zOfferLd = z.object({
  "@type": z.literal("Offer"),
  price: z.number(),
  priceCurrency: z.literal("ARS"),
  availability: z.enum([
    "http://schema.org/OutOfStock",
    "http://schema.org/InStock",
  ]),
});

/** Schema of the Product JSON-LD document the parsers rely on. */
const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  sku: z.string().optional(),
  offers: z.object({
    offers: z.array(zOfferLd),
  }),
});
type ProductLd = z.infer<typeof zProductLd>;
|
||||||
|
|
||||||
|
/**
 * Finds the page's Product JSON-LD document and validates it against
 * `zProductLd`.
 *
 * @throws If the document is missing or does not match the schema.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}
|
48
scraper/parsers/coto.ts
Normal file
48
scraper/parsers/coto.ts
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { type Precioish } from "../scrap.js";
|
||||||
|
|
||||||
|
/**
 * Extracts the EAN from the first `div#brandText` whose text contains
 * "| EAN: "; the value is the text of its second `span.span_codigoplu`.
 *
 * @throws If that div, or a non-empty EAN inside it, cannot be found.
 */
function getEanFromText({ document }: Window) {
  const candidates = Array.from(document.querySelectorAll("div#brandText"));

  let eanParent: Element | undefined;
  for (const candidate of candidates) {
    if (candidate.textContent?.includes("| EAN: ")) {
      eanParent = candidate;
      break;
    }
  }
  if (!eanParent) throw new Error("no encuentro el eanparent");

  const codeSpans = Array.from(
    eanParent.querySelectorAll("span.span_codigoplu")
  );
  const ean = codeSpans[1]?.textContent?.trim();
  if (!ean) throw new Error("no encuentro el ean");
  return ean;
}
|
||||||
|
/**
 * Reads the price shown in `.atg_store_newPrice`, written in the Argentine
 * format seen on the page (e.g. "$1.234,56").
 *
 * @returns The price in whole centavos, or `null` when the element is
 *   missing, empty or not a number.
 */
function getPriceFromText({ document }: Window) {
  const el = document.querySelector(".atg_store_newPrice");
  if (!el?.textContent) return null;
  const nStr = el.textContent
    .trim()
    .replace("$", "")
    .replace(/\./g, "") // "." is the thousands separator
    .replace(",", "."); // "," is the decimal separator
  // Rounded because float multiplication is inexact (1.15 * 100 = 114.99999999999999).
  const precioCentavos = Math.round(parseFloat(nStr) * 100);
  return Number.isFinite(precioCentavos) ? precioCentavos : null;
}
|
||||||
|
/**
 * A product counts as in stock when the page has no `.product_not_available`
 * element.
 */
function getInStock({ document }: Window) {
  const unavailableMarker = document.querySelector(".product_not_available");
  return unavailableMarker ? false : true;
}
|
||||||
|
|
||||||
|
/**
 * Parses a Coto product page into a `Precioish`: EAN, price and stock come
 * from the helpers above, the name from `h1.product_page` and the image from
 * the `<img>` inside `.zoom`.
 *
 * @throws If the EAN cannot be found in the page.
 */
export function getCotoProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);
  const { document } = dom;

  const ean = getEanFromText(dom);
  const precioCentavos = getPriceFromText(dom);
  const inStock = getInStock(dom);

  const heading = document.querySelector("h1.product_page");
  const name = heading?.textContent?.trim();
  const image = document.querySelector<HTMLImageElement>(".zoom img");
  const imageUrl = image?.src;

  return { name, imageUrl, ean, precioCentavos, inStock };
}
|
25
scraper/parsers/dia.ts
Normal file
25
scraper/parsers/dia.ts
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { type Precioish } from "../scrap.js";
|
||||||
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Parses the HTML of a Dia product page into a `Precioish`.
 *
 * The EAN is read from the `product:retailer_item_id` meta property, the
 * price from `priceFromMeta`, and name, image and availability from the
 * product JSON-LD returned by `getProductJsonLd`.
 *
 * @param html - Raw page contents.
 * @returns The parsed product data; `inStock` is `true` only when the first
 *   JSON-LD offer has availability `http://schema.org/InStock`.
 * @throws Error when the `product:retailer_item_id` meta property is missing
 *   or empty.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const page = parseHTML(html);

  const ean = getMetaProp(page, "product:retailer_item_id");
  if (!ean) {
    throw new Error("No encontré el ean");
  }
  const precioCentavos = priceFromMeta(page);

  const { name, image: imageUrl, offers } = getProductJsonLd(page);
  const [firstOffer] = offers.offers;
  const inStock = firstOffer.availability === "http://schema.org/InStock";

  return { name, imageUrl, ean, precioCentavos, inStock };
}
|
54
scraper/parsers/jumbo.ts
Normal file
54
scraper/parsers/jumbo.ts
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { type Precioish } from "../scrap.js";
|
||||||
|
import { getProductJsonLd, priceFromMeta, stockFromMeta } from "./common.js";
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
/** One entry of a product's `items` array; only its string `ean` is validated. */
const zJumboSearchItem = z.object({ ean: z.string() });

/**
 * Schema used to validate the JSON returned by the Jumbo search endpoint:
 * a tuple holding exactly one product object with an `items` array.
 */
const zJumboSearch = z.tuple([
  z.object({ items: z.array(zJumboSearchItem) }),
]);
|
||||||
|
|
||||||
|
/**
 * Looks up the EAN of a Jumbo product through the store's catalog search API.
 *
 * @param sku - Retailer SKU id, sent as the `fq=skuId:<sku>` filter.
 * @returns The EAN shared by every item of the single product returned.
 * @throws Error when the request gets a non-OK HTTP status, the response does
 *   not match `zJumboSearch`, the product has no items, or the items do not
 *   all carry the same EAN.
 */
async function getEanFromSearch(sku: string) {
  const url = new URL(
    "https://www.jumbo.com.ar/api/catalog_system/pub/products/search"
  );
  url.searchParams.set("fq", `skuId:${sku}`);

  const res = await fetch(url);
  // fetch() only rejects on network failures; surface HTTP errors here rather
  // than failing later with a confusing JSON or schema error.
  if (!res.ok) {
    throw new Error(
      `No pude buscar el EAN en Jumbo (sku ${sku}): status=${res.status}`
    );
  }

  const [product] = zJumboSearch.parse(await res.json());
  const [firstItem] = product.items;
  // An empty `items` array passes the schema; without this guard the EAN
  // lookup below would die with an opaque TypeError.
  if (!firstItem) {
    throw new Error(`Jumbo no devolvió items para el SKU ${sku}`);
  }

  const ean = firstItem.ean;
  if (!product.items.every((item) => item.ean === ean)) {
    throw new Error("Inesperado: no todos los items tienen el mismo EAN");
  }
  return ean;
}
|
||||||
|
|
||||||
|
/**
 * Parses the HTML of a Jumbo product page into a `Precioish`.
 *
 * Price and stock come from `priceFromMeta` / `stockFromMeta`; name, image
 * and SKU come from the product JSON-LD. The EAN is then resolved from the
 * SKU with `getEanFromSearch`.
 *
 * @param html - Raw page contents.
 * @returns The parsed product data.
 * @throws Error when the JSON-LD carries no SKU, plus anything thrown by
 *   `getEanFromSearch`.
 */
export async function getJumboProduct(
  html: string | Buffer
): Promise<Precioish> {
  const page = parseHTML(html);
  const precioCentavos = priceFromMeta(page);
  const inStock = stockFromMeta(page);

  const { name, image: imageUrl, sku } = getProductJsonLd(page);
  if (!sku) {
    throw new Error("No encontré el SKU de Jumbo para pedir el EAN");
  }

  return {
    name,
    imageUrl,
    ean: await getEanFromSearch(sku),
    precioCentavos,
    inStock,
  };
}
|
127
scraper/scrap.ts
Normal file
127
scraper/scrap.ts
Normal file
|
@ -0,0 +1,127 @@
|
||||||
|
/// <reference lib="dom" />
|
||||||
|
import * as schema from "db-datos/schema.js";
|
||||||
|
import { writeFile, mkdir } from "fs/promises";
|
||||||
|
import { createHash } from "crypto";
|
||||||
|
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||||
|
import { getDiaProduct } from "./parsers/dia.js";
|
||||||
|
import { getCotoProduct } from "./parsers/coto.js";
|
||||||
|
import { join } from "path";
|
||||||
|
import { db } from "db-datos/db.js";
|
||||||
|
import pMap from "p-map";
|
||||||
|
import { getJumboProduct } from "./parsers/jumbo.js";
|
||||||
|
|
||||||
|
/** When true, `scrap` dumps the HTML of pages it failed to process under `debug/`. */
const DEBUG = true;
|
||||||
|
/** Stored as `parserVersion` on every price row that `scrap` inserts. */
const PARSER_VERSION = 4;
|
||||||
|
|
||||||
|
/** Insert type inferred from the `precios` table of the database schema. */
export type Precio = typeof schema.precios.$inferInsert;
|
||||||
|
/**
 * A `Precio` without `fetchedAt`, `url`, `id`, `warcRecordId` and
 * `parserVersion`. This is the shape the per-site parsers return.
 */
export type Precioish = Omit<
|
||||||
|
Precio,
|
||||||
|
"fetchedAt" | "url" | "id" | "warcRecordId" | "parserVersion"
|
||||||
|
>;
|
||||||
|
|
||||||
|
/**
 * Scrapes every URL listed in a text file (one URL per line, empty lines
 * ignored) and summarizes the outcome.
 *
 * URLs are processed with `pMap` at a concurrency of 32. Each URL is given up
 * to 6 attempts, waiting 1500 ms before every retry; retrying stops as soon as
 * `scrap` reports `done` or `skipped`. A URL that still ends in `error` is
 * logged with `console.error`.
 *
 * @param path - Path of the file holding the URL list.
 * @returns Counts of `done` and `skipped` URLs plus the list of error results.
 */
export async function downloadList(path: string) {
  const fileText = await Bun.file(path).text();
  const urls = fileText.split("\n").filter((line) => line.length > 0);

  const scrapWithRetries = async (urlS: string) => {
    let outcome: ScrapResult = { type: "skipped" };
    let attempt = 0;
    while (attempt < 6) {
      if (attempt !== 0) await wait(1500);
      outcome = await scrap(urlS);
      // Only an "error" outcome is worth another attempt.
      if (outcome.type !== "error") break;
      attempt++;
    }
    if (outcome.type === "error") console.error(outcome);
    return outcome;
  };

  const results = await pMap(urls, scrapWithRetries, { concurrency: 32 });

  const progress: {
    done: number;
    skipped: number;
    errors: { error: any; url: string; debugPath: string }[];
  } = { done: 0, skipped: 0, errors: [] };
  for (const result of results) {
    if (result.type === "done") progress.done++;
    else if (result.type === "skipped") progress.skipped++;
    else progress.errors.push(result);
  }
  return progress;
}
|
||||||
|
|
||||||
|
/**
 * Dispatches a downloaded product page to the parser for its store, chosen by
 * the URL's hostname.
 *
 * @param url - URL the page was fetched from; only `hostname` is inspected.
 * @param html - Page contents.
 * @returns The product data produced by the matching parser.
 * @throws Error `Unknown host <hostname>` when no parser handles the hostname,
 *   plus anything the selected parser throws.
 */
export async function getProduct(url: URL, html: string): Promise<Precioish> {
  switch (url.hostname) {
    case "www.carrefour.com.ar":
      return getCarrefourProduct(html);
    case "diaonline.supermercadosdia.com.ar":
      return getDiaProduct(html);
    case "www.cotodigital3.com.ar":
      return getCotoProduct(html);
    case "www.jumbo.com.ar":
      return await getJumboProduct(html);
    default:
      throw new Error(`Unknown host ${url.hostname}`);
  }
}
|
||||||
|
|
||||||
|
/**
 * Outcome of scraping a single URL, discriminated by `type`:
 * - `skipped`: the URL was not processed;
 * - `done`: the price was stored;
 * - `error`: processing failed; carries the URL, the caught error and the
 *   path used for the debug HTML dump.
 */
type ScrapResult =
|
||||||
|
| { type: "skipped" }
|
||||||
|
| { type: "done" }
|
||||||
|
| { type: "error"; url: string; error: any; debugPath: string };
|
||||||
|
/**
 * Downloads one product page, parses it and inserts the price into the
 * database.
 *
 * @param urlS - URL of the product page, as a string.
 * @returns
 *   - `skipped` when `urlS` is not a valid URL or the response status is not
 *     OK;
 *   - `done` once the price row has been inserted;
 *   - `error` when downloading, parsing or inserting throws. `debugPath` is
 *     `debug/<md5 of urlS>.html`; the page is written there only when `DEBUG`
 *     is on and the body had already been downloaded.
 */
async function scrap(urlS: string): Promise<ScrapResult> {
  let url: URL;
  try {
    url = new URL(urlS);
  } catch (err) {
    console.error(`skipped ${urlS} because ${err}`);
    return { type: "skipped" };
  }

  // Declared outside the `try` so the `catch` can dump it when available.
  let html: string | undefined;
  try {
    // fetch() and text() reject on network failures. Keeping them inside the
    // try turns that into an "error" result the caller can retry, instead of
    // an exception that aborts the whole run.
    const res = await fetch(url);
    if (!res.ok) {
      console.debug(`skipped ${urlS} because status=${res.status} (!=200)`);
      return { type: "skipped" };
    }
    html = await res.text();

    const ish = await getProduct(url, html);
    const p: Precio = {
      ...ish,
      fetchedAt: new Date(),
      url: urlS,
      parserVersion: PARSER_VERSION,
    };
    await db.insert(schema.precios).values(p);

    return { type: "done" };
  } catch (error) {
    const urlHash = createHash("md5").update(urlS).digest("hex");
    const output = join("debug", `${urlHash}.html`);
    // Nothing to dump if the failure happened before the body was read.
    if (DEBUG && html !== undefined) {
      try {
        await mkdir("debug", { recursive: true });
        await writeFile(output, html);
      } catch (writeError) {
        // The dump is best-effort: report it, but keep the original error.
        console.error(`could not write ${output}: ${writeError}`);
      }
    }
    return {
      type: "error",
      url: urlS,
      error,
      debugPath: output,
    };
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds
 * fires.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
4
scraper/tsconfig.json
Normal file
4
scraper/tsconfig.json
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"extends": "../tsconfig.json",
|
||||||
|
"exclude": ["../sitio"]
|
||||||
|
}
|
|
@ -11,7 +11,7 @@ export const load: PageServerLoad = async ({ url }) => {
|
||||||
join precios p on p.ean = f.ean
|
join precios p on p.ean = f.ean
|
||||||
where f.name match ${`"${query}"`}
|
where f.name match ${`"${query}"`}
|
||||||
group by p.ean
|
group by p.ean
|
||||||
having max(p.fetched_at)
|
having max(p.fetched_at) and max(p.in_stock)
|
||||||
order by p.in_stock desc;`;
|
order by p.in_stock desc;`;
|
||||||
results = db.all(sqlQuery);
|
results = db.all(sqlQuery);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue