2022-11-25 21:05:58 +00:00
|
|
|
import { Parser } from "htmlparser2";
|
2022-11-26 01:01:30 +00:00
|
|
|
import { DomHandler, Element } from "domhandler";
|
2022-11-25 21:47:57 +00:00
|
|
|
import { readdir, readFile } from "fs/promises";
|
|
|
|
import { join } from "path";
|
2022-11-25 21:05:58 +00:00
|
|
|
|
2022-11-25 21:47:57 +00:00
|
|
|
// Command-line arguments of the running Node process.
const argv = process.argv;

// Directory to scan: the first user-supplied argument. When it is absent
// (or empty) the current working directory is used instead.
const dirPath = argv[2] ? argv[2] : ".";
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * One finding recorded while scanning an HTML document.
 */
interface Thing {
  /** Text identifying the offending element (link text or an HTML snippet). */
  description: string;

  /**
   * Category of the finding. The `link-*` values are recorded for `<a>`
   * elements, the `media-*` values for `<audio>`, `<video>`, `<img>` and
   * `<source>` elements.
   */
  type:
    | "link-absolute"
    | "link-http"
    | "link-no-href"
    | "media-absolute"
    | "media-http"
    | "media-no-src";
}
|
|
|
|
/**
 * The findings gathered for a single HTML document.
 */
interface Report {
  /** Findings in the order they were recorded during the element walk. */
  things: Array<Thing>;
}
|
2022-11-25 21:05:58 +00:00
|
|
|
|
2022-11-26 01:09:41 +00:00
|
|
|
/**
 * Context passed down the element walk for one document.
 */
interface Page {
  /** The document's HTML source; element snippets are sliced out of it. */
  rawHtml: string;

  /** Accumulator that findings are pushed into. */
  report: Report;
}
|
|
|
|
|
|
|
|
/**
 * Inspects `el` and then every descendant tag, pushing findings onto
 * `page.report.things`.
 *
 * - `<a>`: records `link-no-href` when the `href` attribute is missing,
 *   `link-http` when it is an HTTP(S)/protocol-relative URL, and
 *   `link-absolute` when it starts with `/`. Other hrefs are not recorded.
 * - `<audio>`, `<video>`, `<img>`, `<source>`: records `media-no-src` when
 *   `src` is missing or empty, otherwise defers to `checkUrl`.
 *
 * @param page - Report accumulator plus the raw HTML of the document.
 * @param el - Element to inspect.
 */
function recurseElement(page: Page, el: Element) {
  const { report, rawHtml } = page;

  switch (el.name) {
    case "a": {
      const href = el.attribs.href;
      if (href === undefined) {
        report.things.push({ type: "link-no-href", description: getText(el) });
      } else if (isHttp(href)) {
        report.things.push({ type: "link-http", description: getText(el) });
      } else if (isAbsolute(href)) {
        report.things.push({ type: "link-absolute", description: getText(el) });
      }
      break;
    }
    case "audio":
    case "video":
    case "img":
    case "source": {
      const src = el.attribs.src;
      if (src) {
        checkUrl(report, rawHtml, el, src);
      } else {
        report.things.push({
          type: "media-no-src",
          description: getHtml(rawHtml, el),
        });
      }
      break;
    }
  }

  // Descend into child tags only; text, comments and other node kinds are skipped.
  for (const child of el.children) {
    if (child.type !== "tag") continue;
    recurseElement(page, child);
  }
}
|
|
|
|
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Classifies a media element's URL and records a finding when it is either
 * an HTTP(S)/protocol-relative URL (`media-http`) or a root-absolute path
 * (`media-absolute`). Any other URL is left unreported.
 *
 * @param report - Accumulator the finding is pushed into.
 * @param rawHtml - HTML source of the document, used to build the description.
 * @param el - The media element that carries `url`.
 * @param url - The URL to classify.
 */
function checkUrl(report: Report, rawHtml: string, el: Element, url: string) {
  // The HTTP check runs first, so protocol-relative `//host/...` URLs are not
  // reported as absolute paths.
  const kind = isHttp(url)
    ? "media-http"
    : isAbsolute(url)
    ? "media-absolute"
    : undefined;

  if (kind === undefined) return;

  report.things.push({ type: kind, description: getHtml(rawHtml, el) });
}
|
|
|
|
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Reports whether `url` points at an HTTP(S) location: it starts with
 * `http://`, `https://`, or the protocol-relative prefix `//`.
 *
 * The scheme is matched case-insensitively because URI schemes are
 * case-insensitive, so `HTTPS://example.com` is recognised as well.
 *
 * @param url - The URL string to test.
 * @returns `true` when the URL is HTTP(S) or protocol-relative.
 */
function isHttp(url: string): boolean {
  return /^(https?:)?\/\//i.test(url);
}
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Reports whether `url` begins with a forward slash.
 *
 * @param url - The URL string to test.
 * @returns `true` when the first character is `/`; `false` otherwise,
 * including for the empty string.
 */
function isAbsolute(url: string) {
  return url.charAt(0) === "/";
}
|
|
|
|
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Returns the source text of `el` as it appears in `rawHtml`.
 *
 * @param rawHtml - The HTML source the element was parsed from.
 * @param el - Element whose `startIndex` / `endIndex` locate it in `rawHtml`.
 * @returns The element's markup, or an empty string when either index is
 * unavailable (indices are only populated when the handler is created with
 * `withStartIndices` / `withEndIndices`).
 */
function getHtml(rawHtml: string, el: Element) {
  const { startIndex, endIndex } = el;
  if (startIndex == null || endIndex == null) return "";

  // `endIndex` is the position of the element's last character (inclusive),
  // while `slice` takes an exclusive end, hence the `+ 1`. Without it the
  // closing `>` is cut off.
  return rawHtml.slice(startIndex, endIndex + 1);
}
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Returns the readable text inside `el`: the text of all descendant text
 * nodes (reached through child tags), with every run of whitespace collapsed
 * to a single space and the ends trimmed.
 *
 * Whitespace is normalised once over the whole concatenation rather than per
 * text node, so words separated only by whitespace at a node boundary
 * (e.g. `Read <b>more</b>`) are not fused together.
 *
 * @param el - Element whose text content is wanted.
 * @returns The normalised text, or an empty string when there is none.
 */
function getText(el: Element) {
  // Gathers text exactly as written; normalisation happens once at the end.
  const collectRaw = (node: Element): string => {
    let raw = "";
    for (const child of node.children) {
      if (child.type === "text") raw += child.data;
      else if (child.type === "tag") raw += collectRaw(child);
    }
    return raw;
  };

  return collectRaw(el).replace(/\s+/g, " ").trim();
}
|
|
|
|
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Parses an HTML document and scans every top-level tag (and its
 * descendants) for findings.
 *
 * @param content - HTML source of the document.
 * @returns A promise that resolves with the document's report, or rejects
 * with the error passed to the DOM handler callback.
 */
function processFile(content: string): Promise<Report> {
  return new Promise((resolve, reject) => {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          reject(error);
          return;
        }

        const report: Report = { things: [] };
        const page = { report, rawHtml: content };

        for (const node of dom) {
          if (node.type !== "tag") continue;
          recurseElement(page, node);
        }

        resolve(report);
      },
      // Source offsets are requested so element snippets can be sliced out of
      // `content` later on.
      { withStartIndices: true, withEndIndices: true }
    );

    new Parser(handler).parseComplete(content);
  });
}
|
2022-11-25 21:47:57 +00:00
|
|
|
|
2022-11-26 01:01:30 +00:00
|
|
|
/**
 * Reports keyed by the path of the HTML file they were produced from.
 */
interface Reports {
  [filePath: string]: Report;
}

// Accumulator for the whole run; populated in place by `recurseDirectory`.
let reports: Reports = {};
|
|
|
|
/**
 * Walks the directory tree rooted at `path` and stores a report for every
 * file whose name ends in `.html`, keyed by the file's joined path.
 * Entries are handled one at a time, in the order `readdir` returns them.
 *
 * @param reports - Map that is filled in place.
 * @param path - Directory to scan.
 */
async function recurseDirectory(reports: Reports, path: string) {
  const entries = await readdir(path, { withFileTypes: true });

  for (const entry of entries) {
    const entryPath = join(path, entry.name);

    if (entry.isDirectory()) {
      await recurseDirectory(reports, entryPath);
      continue;
    }

    // Only HTML documents are scanned.
    if (!entry.name.endsWith(".html")) continue;

    const html = await readFile(entryPath, "utf-8");
    reports[entryPath] = await processFile(html);
  }
}
|
2022-11-25 22:07:13 +00:00
|
|
|
// Scan the target directory tree; `reports` is filled in place.
await recurseDirectory(reports, dirPath);

// Every finding from every scanned file, flattened into one list.
const totalThings = Object.values(reports).flatMap((report) => report.things);

// Count findings per type in a single pass. A Map iterates in insertion
// order, so types are listed in the order they were first encountered.
const tally = new Map<(typeof totalThings)[number]["type"], number>();
for (const { type } of totalThings) {
  tally.set(type, (tally.get(type) ?? 0) + 1);
}

const fileCount = Object.keys(reports).length;
console.log(
  `Finished with ${fileCount} files read, ${totalThings.length} things`
);

for (const [kind, count] of tally) {
  console.log(`==> ${kind}: ${count}`);
}

console.log("This means:");

// Root-absolute paths, whether on links or on media.
const pathBasedCount =
  (tally.get("media-absolute") ?? 0) + (tally.get("link-absolute") ?? 0);
console.log(
  `==> ${pathBasedCount} problems that affect users using legacy IPFS gateways`
);

const mediaHttp = tally.get("media-http") ?? 0;
console.log(
  `==> ${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`
);

const linkHttp = tally.get("link-http") ?? 0;
console.log(
  `==> ${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`
);
|