From 405502877c288e24e45c176b86e60b98d4ee67ca Mon Sep 17 00:00:00 2001 From: Nulo Date: Mon, 1 Jan 2024 16:50:17 -0300 Subject: [PATCH] ignorar paginas status!=200 --- scraper/scrap.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scraper/scrap.ts b/scraper/scrap.ts index dc5f4df..0698ffe 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -52,6 +52,12 @@ export async function parseWarc(path: string) { console.debug(`skipped ${warcRecordId}`); continue; } + if (record.httpHeaders?.statusCode !== 200) { + console.debug( + `skipped ${warcRecordId} because status=${record.httpHeaders?.statusCode} (!=200)` + ); + continue; + } // TODO: sobreescribir si existe el mismo record-id pero con version mas bajo? const html = await record.contentText();