281 lines
7.2 KiB
TypeScript
281 lines
7.2 KiB
TypeScript
import { Parser } from "htmlparser2";
|
|
import { DomHandler, Element } from "domhandler";
|
|
import { readdir, readFile, writeFile } from "fs/promises";
|
|
import { join, relative } from "path";
|
|
import {
|
|
doctype,
|
|
metaUtf8,
|
|
render,
|
|
title,
|
|
ul,
|
|
li,
|
|
h2,
|
|
ol,
|
|
basicElement,
|
|
a,
|
|
} from "@nulo/html.js";
|
|
|
|
// Command-line arguments of the current process.
const argv = process.argv;
// Directory to analyze: the first CLI argument, or the current directory
// when that argument is missing or empty.
const dirPath = argv[2] ? argv[2] : ".";
|
|
/** Every category of finding the analyzer can emit. */
type ThingKind =
  | "link-http"
  | "link-absolute"
  | "link-no-href"
  | "media-http"
  | "media-absolute"
  | "media-no-src";

/** A single finding recorded while walking a page. */
interface Thing {
  /** Category of the finding. */
  type: ThingKind;
  /** Human-readable detail shown next to the finding in the report. */
  description: string;
}

/** All findings collected for one HTML file. */
type Report = {
  things: Thing[];
};

/** A file being analyzed: its raw markup plus the report being filled in. */
type Page = {
  /** Findings are appended here as the DOM is walked. */
  report: Report;
  /** Original HTML source of the file. */
  rawHtml: string;
};
|
|
|
|
/**
 * Inspects `el` and, recursively, every descendant tag, appending a finding
 * to `page.report.things` for each problematic link or media reference.
 *
 * Findings produced:
 * - `<a>`: `link-no-href` when `href` is absent, otherwise `link-http` or
 *   `link-absolute` depending on the URL.
 * - `<audio>` / `<video>`: each `<source>` child is checked; when there are
 *   none, the element's own `src` is checked instead.
 * - `<picture>`: each `<source>` child without `srcset` is `media-no-src`.
 * - `<img>`, `<iframe>`, `<track>`: the element's own `src` is checked.
 *
 * Each `<audio>` / `<video>` element is processed exactly once, so its
 * `<source>` children are not reported twice.
 *
 * @param page Report to append to, plus the raw HTML used for descriptions.
 * @param el Element at which the walk starts.
 */
function recurseElement(page: Page, el: Element) {
  const { report, rawHtml } = page;

  // Records an element that lacks the URL attribute it needs.
  const reportMissingSrc = (target: Element) => {
    report.things.push({
      type: "media-no-src",
      description: getHtml(rawHtml, target),
    });
  };

  // Checks the `src` of `target`, or reports it as missing when absent/empty.
  const checkSrc = (target: Element) => {
    if (target.attribs.src) {
      checkUrl(page, target, target.attribs.src);
    } else {
      reportMissingSrc(target);
    }
  };

  if (el.name === "a") {
    const { href } = el.attribs;
    if (href === undefined) {
      report.things.push({
        type: "link-no-href",
        description: getHtml(rawHtml, el),
      });
    } else if (isHttp(href)) {
      report.things.push({
        type: "link-http",
        description: getText(el),
      });
    } else if (isAbsolute(href)) {
      report.things.push({
        type: "link-absolute",
        description: getText(el),
      });
    }
  }

  if (el.name === "audio" || el.name === "video") {
    const sources = getSources(el);
    if (sources.length > 0) {
      for (const source of sources) {
        checkSrc(source);
      }
    } else {
      // No <source> children: the media URL has to be on the element itself.
      checkSrc(el);
    }
  }

  if (el.name === "picture") {
    for (const source of getSources(el)) {
      // TODO: implement srcset #3 (the srcset URLs themselves are not checked yet)
      if (!source.attribs.srcset) {
        reportMissingSrc(source);
      }
    }
  }

  // TODO: implement srcset #3 (`srcset` on <img> is not inspected yet)
  if (["img", "iframe", "track"].includes(el.name)) {
    checkSrc(el);
  }

  for (const child of el.children) {
    if (child.type === "tag") {
      recurseElement(page, child);
    }
  }
}
|
|
|
|
/**
 * Returns the direct `<source>` children of `el`, in document order.
 * Only immediate children are considered; deeper descendants are ignored.
 */
function getSources(el: Element) {
  const sources: Element[] = [];
  for (const child of el.children) {
    if (child.type === "tag" && child.name === "source") {
      sources.push(child);
    }
  }
  return sources;
}
|
|
|
|
/**
 * Classifies a media URL and records a finding when it is problematic.
 *
 * Pushes `media-http` when `isHttp(url)` holds, `media-absolute` when
 * `isAbsolute(url)` holds, and nothing otherwise. The finding's description
 * is the raw markup of `el`.
 *
 * @param page Destructured into the report to append to and the raw HTML.
 * @param el Element that carries the URL.
 * @param url URL to classify.
 */
function checkUrl({ report, rawHtml }: Page, el: Element, url: string) {
  const remote = isHttp(url);
  // Neither remote nor root-relative: nothing to report.
  if (!remote && !isAbsolute(url)) return;

  report.things.push({
    type: remote ? "media-http" : "media-absolute",
    description: getHtml(rawHtml, el),
  });
}
|
|
|
|
/**
 * Tells whether `url` points at an HTTP(S) location: it starts with
 * `http://`, `https://`, or the protocol-relative prefix `//`.
 *
 * The scheme is matched case-insensitively, since URL schemes are
 * case-insensitive (`HTTP://example.com` is the same as `http://example.com`).
 *
 * @param url URL exactly as written in the attribute.
 * @returns `true` when the URL is HTTP(S) or protocol-relative.
 */
function isHttp(url: string) {
  return /^(?:https?:)?\/\//i.test(url);
}
|
|
/**
 * Tells whether `url` begins with a slash, i.e. is a root-relative path.
 * Protocol-relative URLs (`//host/...`) also begin with a slash and
 * therefore match as well.
 */
function isAbsolute(url: string) {
  return url.indexOf("/") === 0;
}
|
|
|
|
/**
 * Extracts the original markup of `el` from `rawHtml` using the element's
 * recorded source positions.
 *
 * @param rawHtml Full HTML source that `el` was parsed from.
 * @param el Element carrying `startIndex` and `endIndex`.
 * @returns The substring of `rawHtml` covering the element.
 */
function getHtml(rawHtml: string, el: Element) {
  const from = el.startIndex!;
  // `endIndex` is treated as inclusive, so the slice must end one past it.
  const to = el.endIndex! + 1;
  return rawHtml.slice(from, to);
}
|
|
/**
 * Returns the text contained in `el` and its descendant tags, in document
 * order, as a single line.
 *
 * Raw text is gathered first and whitespace is normalized once at the end
 * (runs collapse to a single space, ends are trimmed). Trimming every text
 * node separately would glue words from neighbouring nodes together, e.g.
 * `Hello <b>world</b>` would become `Helloworld`.
 *
 * @param el Element whose text should be collected.
 * @returns Collapsed, trimmed text; `""` when there is none.
 */
function getText(el: Element): string {
  let raw = "";

  // Appends the text of `node`'s subtree to `raw`, depth-first.
  const collect = (node: Element): void => {
    for (const child of node.children) {
      if (child.type === "text") raw += child.data;
      else if (child.type === "tag") collect(child);
    }
  };

  collect(el);
  return raw.replace(/\s+/g, " ").trim();
}
|
|
|
|
/**
 * Parses `content` as HTML and analyzes every top-level tag, and through
 * `recurseElement` all of their descendants.
 *
 * The DOM handler is created with start and end indices enabled.
 *
 * @param content Raw HTML source of one file.
 * @returns A promise resolving to the findings for that file; it rejects
 *   when the DOM handler reports an error.
 */
function processFile(content: string): Promise<Report> {
  return new Promise((resolve, reject) => {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          reject(error);
          return;
        }

        const report: Report = { things: [] };
        for (const node of dom) {
          // Only tags are analyzed; other top-level nodes are skipped.
          if (node.type !== "tag") continue;
          recurseElement({ report, rawHtml: content }, node);
        }
        resolve(report);
      },
      { withEndIndices: true, withStartIndices: true }
    );

    new Parser(handler).parseComplete(content);
  });
}
|
|
|
|
/** Reports keyed by the path of the HTML file they describe. */
type Reports = Map<string, Report>;

/** Reports gathered for the directory being analyzed. */
const reports: Reports = new Map();

/**
 * Walks `path` recursively and stores a report in `reports` for every file
 * whose name ends in `.html`. All entries of a directory are processed
 * concurrently.
 *
 * @param reports Map that receives one entry per analyzed file, keyed by
 *   the file's path (joined onto `path`).
 * @param path Directory to scan.
 * @returns A promise that settles once the whole subtree has been processed.
 */
async function recurseDirectory(reports: Reports, path: string) {
  const entries = await readdir(path, { withFileTypes: true });

  const pending = entries.map(async (entry) => {
    const entryPath = join(path, entry.name);

    if (entry.isDirectory()) {
      await recurseDirectory(reports, entryPath);
      return;
    }

    // Only HTML files are analyzed.
    if (!entry.name.endsWith(".html")) return;

    const html = await readFile(entryPath, "utf-8");
    reports.set(entryPath, await processFile(html));
  });

  return Promise.all(pending);
}
|
|
// Analyze every HTML file under the target directory.
await recurseDirectory(reports, dirPath);

// Flatten the per-file reports into one list, tagging each finding with the
// file it came from.
const totalThings = [...reports.entries()].flatMap(([name, r]) =>
  r.things.map((t) => ({ ...t, fileName: name }))
);

// Kinds that actually occurred, in first-seen order, with the plain link
// kinds moved to the end. The comparator ranks both operands so it stays
// consistent (a comparator that only looks at `b` is not a valid ordering).
const trailingKinds: string[] = ["link-http", "link-no-href"];
const kindRank = (kind: string) => (trailingKinds.includes(kind) ? 1 : 0);
const kinds = [...new Set(totalThings.map((t) => t.type))].sort(
  (a, b) => kindRank(a) - kindRank(b)
);

// `reports` is a Map, so the number of files is `size`; `Object.keys` on a
// Map always yields an empty array.
console.log(
  `Finished with ${reports.size} files read, ${totalThings.length} things`
);
for (const kind of kinds) {
  const count = totalThings.filter((t) => t.type === kind).length;
  console.log(`==> ${kind}: ${count}`);
}

console.log("This means:");
const pathBasedCount = totalThings.filter((t) =>
  ["media-absolute", "link-absolute"].includes(t.type)
).length;
console.log(
  `==> ${pathBasedCount} problems that affect users using legacy IPFS gateways`
);
const mediaHttp = totalThings.filter((t) => t.type === "media-http").length;
console.log(
  `==> ${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`
);
const linkHttp = totalThings.filter((t) => t.type === "link-http").length;
console.log(
  `==> ${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`
);
|
|
|
|
// Element factory for inline <code> snippets.
const code = basicElement("code");

// The HTML report is written to this file, relative to the working directory.
const reportOutput = "report.html";
console.info(`Writing report to ${reportOutput}`);

// Headline numbers, in the order they appear at the top of the report.
const summaryLines = [
  `${pathBasedCount} problems that affect users using legacy IPFS gateways`,
  `${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`,
  `${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`,
];
const summaryList = ul(...summaryLines.map((line) => li(line)));

// Table of contents: one in-page link per kind of finding.
const tableOfContents = ul(
  ...kinds.map((kind) => li(a({ href: `#${kind}` }, kind)))
);

// One heading plus numbered list per kind; each item shows the file path
// (relative to the analyzed directory) and the finding's description.
// NOTE(review): descriptions can contain raw markup copied from the analyzed
// pages — confirm that html.js escapes string children.
const sections = kinds.flatMap((kind) => {
  const items = totalThings
    .filter((thing) => thing.type === kind)
    .map((thing) =>
      li(code(relative(dirPath, thing.fileName)), ": ", code(thing.description))
    );
  return [h2({ id: kind }, kind), ol(...items)];
});

const html = render(
  doctype(),
  metaUtf8,
  title("site-analyzer report"),
  summaryList,
  tableOfContents,
  ...sections
);
await writeFile(reportOutput, html);
|