import { Parser } from "htmlparser2";
import { DomHandler, Element } from "domhandler";
import { readdir, readFile, writeFile } from "fs/promises";
import { join, relative } from "path";
import {
  doctype,
  metaUtf8,
  render,
  title,
  ul,
  li,
  h2,
  ol,
  basicElement,
  a,
} from "@nulo/html.js";

const { argv } = process;
// Directory to scan: first CLI argument, falling back to the current directory.
const dirPath = argv[2] || ".";

/** A single finding recorded while walking a parsed HTML document. */
interface Thing {
  type:
    | "link-http"
    | "link-absolute"
    | "link-no-href"
    | "media-http"
    | "media-absolute"
    | "media-no-src";
  /** Link text (for `link-http` / `link-absolute`) or the raw HTML of the element. */
  description: string;
}

/** All findings collected for one HTML file. */
type Report = {
  things: Thing[];
};

/** The report being filled in, plus the raw HTML used to slice out element source. */
type Page = {
  report: Report;
  rawHtml: string;
};

/**
 * Inspects `el`, appends any findings to `page.report`, then recurses into
 * every child tag.
 *
 * Checks performed:
 * - `<a>`: `href` that is http(s)/protocol-relative or root-absolute, or a
 *   missing `href`.
 * - `<picture>`: `<source>` children without `srcset`.
 * - `<audio>` / `<video>`: `src` of each `<source>` child, or of the element
 *   itself when it has no `<source>` children.
 * - `<img>`, `<iframe>`, `<track>`: the element's own `src`.
 */
function recurseElement(page: Page, el: Element) {
  const { report, rawHtml } = page;

  if (el.name === "a") {
    const { href } = el.attribs;
    if (href === undefined) {
      report.things.push({
        type: "link-no-href",
        description: getHtml(rawHtml, el),
      });
    } else if (isHttp(href)) {
      report.things.push({ type: "link-http", description: getText(el) });
    } else if (isAbsolute(href)) {
      report.things.push({ type: "link-absolute", description: getText(el) });
    }
  }

  if (el.name === "picture") {
    for (const source of getSources(el)) {
      // TODO: implement srcset checking (#3). For now only a missing srcset
      // is reported; the URLs inside a present srcset are not inspected.
      if (!source.attribs.srcset) {
        report.things.push({
          type: "media-no-src",
          description: getHtml(rawHtml, source),
        });
      }
    }
  }

  // TODO: implement srcset checking for <img srcset> (#3).

  if (el.name === "audio" || el.name === "video") {
    // Handled exactly once: <source> children take precedence; the element's
    // own src is only checked when there are no <source> children.
    const sources = getSources(el);
    if (sources.length > 0) {
      for (const source of sources) {
        checkSrc(page, source);
      }
    } else {
      checkSrc(page, el);
    }
  }

  if (["img", "iframe", "track"].includes(el.name)) {
    checkSrc(page, el);
  }

  for (const child of el.children) {
    if (child.type === "tag") {
      recurseElement(page, child);
    }
  }
}

/** Returns the direct `<source>` child elements of `el`. */
function getSources(el: Element) {
  return el.children.filter(
    (c) => c.type === "tag" && c.name === "source"
  ) as Element[];
}

/**
 * Checks the `src` attribute of `el`: a non-empty value is passed to
 * `checkUrl`; a missing or empty value is reported as `media-no-src`.
 */
function checkSrc(page: Page, el: Element) {
  const { src } = el.attribs;
  if (src) {
    checkUrl(page, el, src);
  } else {
    page.report.things.push({
      type: "media-no-src",
      description: getHtml(page.rawHtml, el),
    });
  }
}

/**
 * Reports `el` as `media-http` when `url` is http(s)/protocol-relative, or as
 * `media-absolute` when it starts with "/". Other URLs produce no finding.
 */
function checkUrl({ report, rawHtml }: Page, el: Element, url: string) {
  if (isHttp(url)) {
    report.things.push({
      type: "media-http",
      description: getHtml(rawHtml, el),
    });
  } else if (isAbsolute(url)) {
    report.things.push({
      type: "media-absolute",
      description: getHtml(rawHtml, el),
    });
  }
}

/** True when `url` starts with `http://`, `https://` or `//`. */
function isHttp(url: string) {
  const r = /^(https?:\/\/|\/\/)/;
  return r.test(url);
}

/**
 * True when `url` starts with "/". This also matches `//host/...`, so callers
 * test `isHttp` first.
 */
function isAbsolute(url: string) {
  return url.startsWith("/");
}

/**
 * Returns the slice of `rawHtml` that `el` was parsed from. `endIndex` is
 * treated as inclusive, hence the `+ 1`. Relies on the parser having been
 * configured to record start and end indices (see `processFile`).
 */
function getHtml(rawHtml: string, el: Element) {
  return rawHtml.slice(el.startIndex!, el.endIndex! + 1);
}

/**
 * Concatenates the text of `el` and its descendant tags. Each text node is
 * trimmed individually before being appended.
 */
function getText(el: Element) {
  let text = "";
  for (const child of el.children) {
    if (child.type === "text") text += child.data.trim();
    else if (child.type === "tag") text += getText(child);
  }
  return text;
}

/**
 * Parses `content` as HTML and resolves with the report of everything found
 * in it. Rejects with the handler's error if the DOM handler reports one.
 */
function processFile(content: string): Promise<Report> {
  return new Promise((resolve, reject) => {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          reject(error);
          return;
        }
        const report: Report = { things: [] };
        for (const el of dom) {
          if (el.type === "tag") {
            recurseElement({ report, rawHtml: content }, el);
          }
        }
        resolve(report);
      },
      // Indices are required by getHtml to slice elements out of the source.
      { withEndIndices: true, withStartIndices: true }
    );
    const parser = new Parser(handler);
    parser.parseComplete(content);
  });
}

/** Reports keyed by the path of the file they were produced from. */
type Reports = Map<string, Report>;

/**
 * Walks `path` recursively and stores a report in `reports` for every file
 * whose name ends in ".html". Entries of a directory are processed
 * concurrently.
 */
async function recurseDirectory(reports: Reports, path: string) {
  const dir = await readdir(path, { withFileTypes: true });
  return await Promise.all(
    dir.map(async (file) => {
      const filePath = join(path, file.name);
      if (file.isDirectory()) {
        await recurseDirectory(reports, filePath);
        return;
      }
      if (!file.name.endsWith(".html")) return;
      const content = await readFile(filePath, "utf-8");
      reports.set(filePath, await processFile(content));
    })
  );
}

const reports: Reports = new Map();
await recurseDirectory(reports, dirPath);

// Flatten every finding, tagging each with the file it came from.
const totalThings = [...reports.entries()].flatMap(([name, r]) =>
  r.things.map((t) => ({ ...t, fileName: name }))
);

// Kinds that are listed after all the others in the output.
const lowPriorityKinds: Thing["type"][] = ["link-http", "link-no-href"];
const kindRank = (kind: Thing["type"]) =>
  lowPriorityKinds.includes(kind) ? 1 : 0;
// A rank difference is a consistent comparator; ties keep first-seen order
// because Array.prototype.sort is stable.
const kinds = [...new Set(totalThings.map((t) => t.type))].sort(
  (x, y) => kindRank(x) - kindRank(y)
);

/** Number of findings of the given kind across all files. */
const countOf = (kind: Thing["type"]) =>
  totalThings.filter((t) => t.type === kind).length;

// `reports` is a Map, so the file count is `.size` (Object.keys would give 0).
console.log(
  `Finished with ${reports.size} files read, ${totalThings.length} things`
);
for (const kind of kinds) {
  console.log(`==> ${kind}: ${countOf(kind)}`);
}

const pathBasedCount = countOf("media-absolute") + countOf("link-absolute");
const mediaHttp = countOf("media-http");
const linkHttp = countOf("link-http");

// Shared by the console summary and the HTML report so both stay in sync.
const summary = [
  `${pathBasedCount} problems that affect users using legacy IPFS gateways`,
  `${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`,
  `${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`,
];

console.log("This means:");
for (const line of summary) {
  console.log(`==> ${line}`);
}

const code = basicElement("code");
const reportOutput = "report.html";
console.info(`Writing report to ${reportOutput}`);

// NOTE(review): `t.description` can contain raw HTML taken from the scanned
// pages; confirm that @nulo/html.js escapes string children when rendering.
const html = render(
  doctype(),
  metaUtf8,
  title("site-analyzer report"),
  ul(...summary.map((line) => li(line))),
  ul(...kinds.map((k) => li(a({ href: "#" + k }, k)))),
  ...kinds.flatMap((kind) => [
    h2({ id: kind }, kind),
    ol(
      ...totalThings
        .filter((t) => t.type === kind)
        .map((t) =>
          li(code(relative(dirPath, t.fileName)), ": ", code(t.description))
        )
    ),
  ])
);
await writeFile(reportOutput, html);