// site-analyzer/src/index.ts
import { Parser } from "htmlparser2";
import { DomHandler, Element } from "domhandler";
import { readdir, readFile } from "fs/promises";
import { join } from "path";
// Directory to scan: the first command-line argument, or the current
// directory when no (or an empty) argument is given.
const argv = process.argv;
const dirPath = argv[2] || ".";
/**
 * Category of a finding.
 *
 * `link-*` values are recorded for `<a>` elements, `media-*` values for
 * `<audio>`, `<video>`, `<img>` and `<source>` elements.
 */
type ThingType =
  | "link-http"
  | "link-absolute"
  | "link-no-href"
  | "media-http"
  | "media-absolute"
  | "media-no-src";

/** A single finding recorded while walking a page's elements. */
interface Thing {
  /** Which check produced this finding. */
  type: ThingType;
  /** Link text (for links) or the element's source HTML (for media). */
  description: string;
}
/** The findings collected while analyzing one HTML document. */
interface Report {
  /** Findings in the order they were recorded. */
  things: Array<Thing>;
}
/** Per-document context handed down through the element walk. */
interface Page {
  /** Source text of the document; element offsets are sliced out of it. */
  rawHtml: string;
  /** Accumulator that findings are appended to. */
  report: Report;
}
/** Element names whose `src` attribute is inspected as a media URL. */
const MEDIA_ELEMENT_NAMES = ["audio", "video", "img", "source"];

/**
 * Inspects `el` and, recursively, every descendant element, appending a
 * finding to `page.report` for each link or media element that trips a check.
 *
 * Checks:
 * - `<a>` with no `href` attribute            -> `link-no-href`
 * - `<a>` whose `href` satisfies `isHttp`     -> `link-http`
 * - `<a>` whose `href` satisfies `isAbsolute` -> `link-absolute`
 * - media element with missing/empty `src`    -> `media-no-src`
 * - media element with a non-empty `src`      -> classified by `checkUrl`
 *
 * Link findings are described by the link's text; media findings by the
 * element's source HTML taken from `page.rawHtml`.
 *
 * @param page Report accumulator plus the raw HTML of the document.
 * @param el   Element to inspect; only children of type `"tag"` are visited.
 */
function recurseElement(page: Page, el: Element): void {
  const { report, rawHtml } = page;

  if (el.name === "a") {
    const { href } = el.attribs;
    // An empty href ("") still counts as present; only a missing attribute
    // is reported as `link-no-href`.
    if (href === undefined) {
      report.things.push({ type: "link-no-href", description: getText(el) });
    } else if (isHttp(href)) {
      report.things.push({ type: "link-http", description: getText(el) });
    } else if (isAbsolute(href)) {
      report.things.push({ type: "link-absolute", description: getText(el) });
    }
  }

  if (MEDIA_ELEMENT_NAMES.includes(el.name)) {
    const { src } = el.attribs;
    // Truthiness check: an empty `src` is treated the same as a missing one.
    if (src) {
      checkUrl(page, el, src);
    } else {
      report.things.push({
        type: "media-no-src",
        description: getHtml(rawHtml, el),
      });
    }
  }

  for (const child of el.children) {
    if (child.type === "tag") {
      recurseElement(page, child);
    }
  }
}
/**
 * Classifies a media element's URL and records a finding when it is not a
 * relative URL.
 *
 * - `isHttp(url)`     -> `media-http`
 * - `isAbsolute(url)` -> `media-absolute`
 * - anything else     -> nothing is recorded
 *
 * @param page Destructured for the report accumulator and the raw HTML.
 * @param el   The media element the URL came from; its source HTML becomes
 *             the finding's description.
 * @param url  The URL to classify.
 */
function checkUrl({ report, rawHtml }: Page, el: Element, url: string): void {
  let type: "media-http" | "media-absolute";
  if (isHttp(url)) {
    type = "media-http";
  } else if (isAbsolute(url)) {
    type = "media-absolute";
  } else {
    // Neither check matched, so there is nothing to report.
    return;
  }
  report.things.push({ type, description: getHtml(rawHtml, el) });
}
// Matches URLs that start with `http://`, `https://` or the protocol-relative
// `//`. URI schemes are case-insensitive, hence the `i` flag.
const HTTP_URL_PREFIX = /^(https?:\/\/|\/\/)/i;

/**
 * Tells whether `url` points at an HTTP(S) origin.
 *
 * @param url URL string as found in an attribute value.
 * @returns `true` when `url` begins with `http://`, `https://` (in any letter
 *          case) or `//`; `false` otherwise, including for the empty string.
 */
function isHttp(url: string): boolean {
  return HTTP_URL_PREFIX.test(url);
}
/**
 * Tells whether `url` is a root-relative path such as `/img/a.png`.
 *
 * Protocol-relative URLs (`//host/path`) are excluded: they name another
 * host rather than a path on the current site.
 *
 * @param url URL string as found in an attribute value.
 * @returns `true` when `url` starts with exactly one `/`.
 */
function isAbsolute(url: string): boolean {
  return url.startsWith("/") && !url.startsWith("//");
}
/**
 * Extracts the source text of `el` from the document it was parsed from.
 *
 * @param rawHtml The full HTML text that was fed to the parser.
 * @param el      Element carrying `startIndex` / `endIndex` offsets into
 *                `rawHtml`.
 * @returns The element's source HTML, or `""` when either offset is missing.
 */
function getHtml(rawHtml: string, el: Element): string {
  const { startIndex, endIndex } = el;
  if (startIndex == null || endIndex == null) return "";
  // `endIndex` is the position of the node's last character (inclusive),
  // while `slice` takes an exclusive end, so add one to keep the closing `>`.
  return rawHtml.slice(startIndex, endIndex + 1);
}
/**
 * Returns the text content of `el` and its descendant elements as a single
 * line.
 *
 * Text nodes are gathered in document order, descending into children of type
 * `"tag"`; other node types are ignored. Runs of whitespace are collapsed to
 * one space and the result is trimmed, so adjacent words from different nodes
 * stay separated (`Hello <b>world</b>` yields `"Hello world"`).
 *
 * @param el Element whose text should be collected.
 * @returns The normalized text, or `""` when there is none.
 */
function getText(el: Element): string {
  const pieces: string[] = [];
  const collect = (node: Element): void => {
    for (const child of node.children) {
      if (child.type === "text") pieces.push(child.data);
      else if (child.type === "tag") collect(child);
    }
  };
  collect(el);
  // Normalize once over the joined text so whitespace at node boundaries
  // is preserved as a single separator instead of being trimmed away.
  return pieces.join("").replace(/\s+/g, " ").trim();
}
/**
 * Parses an HTML document and collects findings for all of its elements.
 *
 * The document is parsed in one go with start/end indices enabled so that
 * elements can later be mapped back to their source text in `content`.
 *
 * @param content Full HTML text of the document.
 * @returns A promise resolving to the document's report; it rejects with the
 *          error passed to the DOM handler callback, if any.
 */
function processFile(content: string): Promise<Report> {
  return new Promise((resolve, reject) => {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          reject(error);
          return;
        }
        const report: Report = { things: [] };
        const page = { report, rawHtml: content };
        // Only element nodes at the top level are walked.
        for (const node of dom) {
          if (node.type === "tag") {
            recurseElement(page, node);
          }
        }
        resolve(report);
      },
      { withEndIndices: true, withStartIndices: true }
    );
    new Parser(handler).parseComplete(content);
  });
}
/** Reports keyed by the path of the HTML file each one was produced from. */
interface Reports {
  [filePath: string]: Report;
}

// Populated by the directory walk, then summarized at the end of the script.
const reports: Reports = {};
/**
 * Recursively analyzes every `.html` file under `path`.
 *
 * Entries are handled one at a time: directories are descended into, files
 * whose name ends with `.html` are read as UTF-8 and analyzed, and everything
 * else is skipped.
 *
 * @param reports Map that receives one report per analyzed file, keyed by the
 *                file's path (`path` joined with the entry name).
 * @param path    Directory to scan.
 * @returns A promise that settles once the whole subtree has been processed;
 *          read or parse failures propagate as rejections.
 */
async function recurseDirectory(reports: Reports, path: string): Promise<void> {
  const entries = await readdir(path, { withFileTypes: true });
  for (const entry of entries) {
    const entryPath = join(path, entry.name);
    if (entry.isDirectory()) {
      await recurseDirectory(reports, entryPath);
    } else if (entry.name.endsWith(".html")) {
      const content = await readFile(entryPath, "utf-8");
      reports[entryPath] = await processFile(content);
    }
  }
}
// Scan the target directory, then print a summary of everything found.
await recurseDirectory(reports, dirPath);

// All findings from all files, flattened into one list.
const totalThings = Object.values(reports).flatMap((r) => r.things);

// Tally findings per type in a single pass. A Map iterates in insertion
// order, so types are listed in the order they were first encountered.
const countsByKind = new Map<string, number>();
for (const { type } of totalThings) {
  countsByKind.set(type, (countsByKind.get(type) ?? 0) + 1);
}
const countOf = (kind: string): number => countsByKind.get(kind) ?? 0;

console.log(
  `Finished with ${Object.keys(reports).length} files read, ${totalThings.length} things`
);
for (const [kind, count] of countsByKind) {
  console.log(`==> ${kind}: ${count}`);
}

console.log("This means:");

// Root-relative URLs, for both media and links.
const pathBasedCount = countOf("media-absolute") + countOf("link-absolute");
console.log(
  `==> ${pathBasedCount} problems that affect users using legacy IPFS gateways`
);

const mediaHttp = countOf("media-http");
console.log(
  `==> ${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`
);

const linkHttp = countOf("link-http");
console.log(
  `==> ${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`
);