site-analyzer/src/index.ts
Cat /dev/Nulo ada8122f60
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
chequear sources y otros elementos correctamente
2023-03-31 21:12:30 -03:00

281 lines
7.2 KiB
TypeScript

import { Parser } from "htmlparser2";
import { DomHandler, Element } from "domhandler";
import { readdir, readFile, writeFile } from "fs/promises";
import { join, relative } from "path";
import {
doctype,
metaUtf8,
render,
title,
ul,
li,
h2,
ol,
basicElement,
a,
} from "@nulo/html.js";
// Command-line input: the first positional argument is the directory to
// analyze. When it is missing (or empty) the current directory is used.
const argv = process.argv;
const dirPath = argv[2] ? argv[2] : ".";
/**
 * Category of a finding. `link-*` kinds come from `<a>` elements, `media-*`
 * kinds from embedded resources (audio, video, picture sources, img, iframe,
 * track).
 */
type ThingKind =
  | "link-http"
  | "link-absolute"
  | "link-no-href"
  | "media-http"
  | "media-absolute"
  | "media-no-src";

/** A single finding recorded while scanning a page. */
interface Thing {
  /** Which kind of finding this is. */
  type: ThingKind;
  /** Text shown in the report: the link text or the offending HTML snippet. */
  description: string;
}

/** Every finding collected for one HTML file. */
type Report = {
  things: Array<Thing>;
};

/** The report being filled in, paired with the source HTML it is built from. */
type Page = {
  report: Report;
  /** Unparsed file contents; used to slice out snippets by node index. */
  rawHtml: string;
};
/**
 * Inspects `el` and then, recursively, every descendant tag, appending a
 * finding to `page.report.things` for each link or media reference that is
 * external (`*-http`), root-relative (`*-absolute`) or missing its URL
 * attribute.
 *
 * - `<a>`: classified by `href`; a missing `href` is reported as
 *   `link-no-href`.
 * - `<audio>` / `<video>`: every direct `<source>` child is checked; when
 *   there are none, the element's own `src` is checked instead.
 * - `<picture>`: every direct `<source>` child without `srcset` is reported.
 * - `<img>` / `<iframe>` / `<track>`: the element's own `src` is checked.
 *
 * @param page Report being filled in, plus the raw HTML used for snippets.
 * @param el Element to inspect; its tag children are visited afterwards.
 */
function recurseElement(page: Page, el: Element) {
  const { report, rawHtml } = page;

  switch (el.name) {
    case "a": {
      const { href } = el.attribs;
      if (href === undefined) {
        report.things.push({
          type: "link-no-href",
          description: getHtml(rawHtml, el),
        });
      } else if (isHttp(href)) {
        report.things.push({
          type: "link-http",
          description: getText(el),
        });
      } else if (isAbsolute(href)) {
        report.things.push({
          type: "link-absolute",
          description: getText(el),
        });
      }
      break;
    }
    case "audio":
    case "video": {
      // Handled in exactly one place so that each <source> yields at most one
      // finding; the element's own `src` only matters when it has no sources.
      const sources = getSources(el);
      if (sources.length > 0) {
        for (const source of sources) {
          checkSrcAttribute(page, source);
        }
      } else {
        checkSrcAttribute(page, el);
      }
      break;
    }
    case "picture": {
      for (const source of getSources(el)) {
        if (source.attribs.srcset) {
          // TODO: implement srcset checking (#3)
        } else {
          report.things.push({
            type: "media-no-src",
            description: getHtml(rawHtml, source),
          });
        }
      }
      break;
    }
    case "img":
    case "iframe":
    case "track":
      // TODO: implement srcset checking for <img> (#3)
      checkSrcAttribute(page, el);
      break;
  }

  for (const child of el.children) {
    if (child.type === "tag") {
      recurseElement(page, child);
    }
  }
}

/**
 * Checks the `src` attribute of `el`: a non-empty value is classified through
 * `checkUrl`, a missing or empty one is reported as `media-no-src`.
 */
function checkSrcAttribute(page: Page, el: Element) {
  const { src } = el.attribs;
  if (src) {
    checkUrl(page, el, src);
  } else {
    page.report.things.push({
      type: "media-no-src",
      description: getHtml(page.rawHtml, el),
    });
  }
}
/**
 * Returns the direct `<source>` children of `el`, in document order.
 * Only immediate children are considered; nested descendants are not searched.
 */
function getSources(el: Element) {
  return el.children.filter(
    (node): node is Element => node.type === "tag" && node.name === "source"
  );
}
/**
 * Classifies the URL of a media element and records a finding when it is
 * external (`media-http`) or root-relative (`media-absolute`). Any other URL
 * produces no finding.
 *
 * @param page Report to append to and the raw HTML used for the snippet.
 * @param el Element the URL belongs to; its HTML becomes the description.
 * @param url The URL value to classify.
 */
function checkUrl(page: Page, el: Element, url: string) {
  let kind: "media-http" | "media-absolute";
  if (isHttp(url)) {
    kind = "media-http";
  } else if (isAbsolute(url)) {
    kind = "media-absolute";
  } else {
    return;
  }
  page.report.things.push({
    type: kind,
    description: getHtml(page.rawHtml, el),
  });
}
/**
 * Tells whether `url` points at an HTTP(S) origin: it starts with `http://`,
 * `https://`, or the protocol-relative prefix `//`.
 *
 * The scheme is matched case-insensitively, because URL schemes are
 * case-insensitive and `HTTP://example.com` is just as external as the
 * lowercase spelling.
 *
 * @param url Attribute value to test.
 * @returns `true` when the URL is an HTTP(S) or protocol-relative URL.
 */
function isHttp(url: string) {
  return /^(https?:)?\/\//i.test(url);
}
/**
 * Tells whether `url` begins with a slash, i.e. is root-relative.
 * Protocol-relative URLs (`//host/...`) also match, so callers test `isHttp`
 * first.
 */
function isAbsolute(url: string) {
  return url.charAt(0) === "/";
}
/**
 * Extracts the source text of `el` from `rawHtml` using the node's recorded
 * start and end offsets. `endIndex` is treated as inclusive, hence the `+ 1`.
 * The offsets are asserted non-null: the parser in this file is created with
 * start and end indices enabled.
 */
function getHtml(rawHtml: string, el: Element) {
  const from = el.startIndex!;
  const to = el.endIndex! + 1;
  return rawHtml.slice(from, to);
}
/**
 * Collects the text content of `el`: each text node is trimmed on its own and
 * the pieces are concatenated with no separator, descending into child tags.
 * Nodes that are neither text nor tags contribute nothing.
 */
function getText(el: Element) {
  const pieces: string[] = [];
  for (const node of el.children) {
    if (node.type === "text") {
      pieces.push(node.data.trim());
    } else if (node.type === "tag") {
      pieces.push(getText(node));
    }
  }
  return pieces.join("");
}
/**
 * Parses `content` as HTML and produces the report for that document.
 *
 * The DOM is built with start and end indices enabled so that findings can
 * quote the original markup. Every top-level tag is walked with
 * `recurseElement`.
 *
 * @param content Raw HTML of one file.
 * @returns A promise that resolves with the collected report, or rejects with
 *   the error handed to the DOM handler callback.
 */
function processFile(content: string): Promise<Report> {
  return new Promise((resolve, reject) => {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          reject(error);
          return;
        }
        const report: Report = { things: [] };
        const page: Page = { report, rawHtml: content };
        for (const node of dom) {
          if (node.type === "tag") {
            recurseElement(page, node);
          }
        }
        resolve(report);
      },
      { withEndIndices: true, withStartIndices: true }
    );
    new Parser(handler).parseComplete(content);
  });
}
/** Reports keyed by the path of the HTML file they were produced from. */
type Reports = Map<string, Report>;

// Filled in by the directory walk and read afterwards to build the summary.
const reports: Reports = new Map<string, Report>();
/**
 * Walks `path` recursively and stores a report in `reports` for every file
 * whose name ends in `.html`, keyed by the file's joined path. All entries of
 * a directory are processed concurrently.
 *
 * @param reports Map that receives one entry per analyzed HTML file.
 * @param path Directory to scan.
 * @returns A promise that settles once every entry under `path` is handled.
 */
async function recurseDirectory(reports: Reports, path: string) {
  const entries = await readdir(path, { withFileTypes: true });
  const pending = entries.map(async (entry) => {
    const entryPath = join(path, entry.name);
    if (entry.isDirectory()) {
      await recurseDirectory(reports, entryPath);
      return;
    }
    if (!entry.name.endsWith(".html")) {
      return;
    }
    const html = await readFile(entryPath, "utf-8");
    reports.set(entryPath, await processFile(html));
  });
  return await Promise.all(pending);
}
// Scan the target directory, then flatten every per-file report into one list
// of findings, each tagged with the file it came from.
await recurseDirectory(reports, dirPath);
const totalThings = [...reports.entries()].flatMap(([name, r]) =>
  r.things.map((t) => ({ ...t, fileName: name }))
);

// Kinds that are listed after all the others. NOTE(review): the original
// comparator only ever returned -1 or 0, which is not a consistent ordering;
// it appears to have been meant to push these two kinds to the end — confirm.
const lowPriorityKinds = ["link-http", "link-no-href"];
const kindRank = (kind: string) => (lowPriorityKinds.includes(kind) ? 1 : 0);
// Distinct kinds in first-seen order, low-priority ones last (the sort is
// stable, so ties keep their relative order).
const kinds = [...new Set(totalThings.map((t) => t.type))].sort(
  (a, b) => kindRank(a) - kindRank(b)
);

// Counts the findings whose type is one of `types`.
const countByTypes = (types: string[]) =>
  totalThings.filter((t) => types.includes(t.type)).length;

// `reports` is a Map, so the number of files is its `size`; `Object.keys` on a
// Map always yields an empty array.
console.log(
  `Finished with ${reports.size} files read, ${totalThings.length} things`
);
for (const kind of kinds) {
  console.log(`==> ${kind}: ${countByTypes([kind])}`);
}

console.log("This means:");
const pathBasedCount = countByTypes(["media-absolute", "link-absolute"]);
console.log(
  `==> ${pathBasedCount} problems that affect users using legacy IPFS gateways`
);
const mediaHttp = countByTypes(["media-http"]);
console.log(
  `==> ${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`
);
const linkHttp = countByTypes(["link-http"]);
console.log(
  `==> ${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`
);
// Build the HTML report: a summary list, a table of contents with one anchor
// per kind, and one numbered section per kind listing every finding.
const code = basicElement("code");
const reportOutput = "report.html";
console.info(`Writing report to ${reportOutput}`);

const reportDoctype = doctype();
const reportTitle = title("site-analyzer report");

// Same three headline figures that are printed to the console.
const summaryList = ul(
  li(`${pathBasedCount} problems that affect users using legacy IPFS gateways`),
  li(
    `${mediaHttp} problems that make the website not self-contained, making it miss content if HTTP is unavailable`
  ),
  li(
    `${linkHttp} links to HTTP sites, which is not a real concern unless it's a key part of the site's navigation.`
  )
);

// Each entry links to the heading whose id is the kind name.
const tableOfContents = ul(
  ...kinds.map((k) => li(a({ href: "#" + k }, k)))
);

const kindSections = kinds.flatMap((kind) => {
  const findings = totalThings.filter((t) => t.type === kind);
  return [
    h2({ id: kind }, kind),
    ol(
      ...findings.map((t) =>
        // File paths are shown relative to the analyzed directory.
        li(code(relative(dirPath, t.fileName)), ": ", code(t.description))
      )
    ),
  ];
});

const html = render(
  reportDoctype,
  metaUtf8,
  reportTitle,
  summaryList,
  tableOfContents,
  ...kindSections
);
await writeFile(reportOutput, html);