Use Node.js instead

This commit is contained in:
Cat /dev/Nulo 2022-11-25 18:05:58 -03:00
parent c3f8ed8ee4
commit ad0ef24257
8 changed files with 197 additions and 133 deletions

4
.gitignore vendored
View file

@ -1,3 +1,3 @@
zig-cache
zig-out
node_modules/
*.html

3
.gitmodules vendored
View file

@ -1,3 +0,0 @@
[submodule "rem"]
path = rem
url = https://github.com/chwayne/rem

View file

@ -1,40 +0,0 @@
const std = @import("std");
/// Build script entry point: declares the `site-analyzer` executable together
/// with the `run` and `test` steps.
pub fn build(b: *std.build.Builder) void {
    // Target and release mode are left to whoever invokes `zig build`; no
    // defaults are overridden here.
    const target = b.standardTargetOptions(.{});
    const mode = b.standardReleaseOptions();

    // The analyzer executable, with the `rem` package wired in from ./rem.
    const analyzer = b.addExecutable("site-analyzer", "src/main.zig");
    analyzer.setTarget(target);
    analyzer.setBuildMode(mode);
    analyzer.addPackage(.{
        .name = "rem",
        .source = .{ .path = "./rem/rem.zig" },
    });
    analyzer.install();

    // `zig build run [-- args...]`: install first, then launch the binary,
    // forwarding any extra command-line arguments.
    const run_analyzer = analyzer.run();
    run_analyzer.step.dependOn(b.getInstallStep());
    if (b.args) |extra_args| run_analyzer.addArgs(extra_args);
    b.step("run", "Run the app").dependOn(&run_analyzer.step);

    // `zig build test`: unit tests rooted at src/main.zig.
    const unit_tests = b.addTest("src/main.zig");
    unit_tests.setTarget(target);
    unit_tests.setBuildMode(mode);
    b.step("test", "Run unit tests").dependOn(&unit_tests.step);
}

88
index.js Normal file
View file

@ -0,0 +1,88 @@
import { Parser } from "htmlparser2";
import { DomHandler } from "domhandler";
import { performance } from "perf_hooks";
// Output sink used by every report line. Point `log` at `noop` instead of
// `console.log` to silence the report.
const noop = () => {};
// const log = noop;
const log = console.log;
/**
 * Reads standard input to the end and returns it as one string.
 *
 * The stream is decoded as UTF-8 by the stream itself (`setEncoding`), so a
 * multi-byte character that happens to be split across two chunks is decoded
 * correctly instead of being turned into replacement characters.
 *
 * @returns {Promise<string>} Resolves with the full input once stdin ends;
 *   rejects if the stream emits an error.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    let buffer = "";
    process.stdin.setEncoding("utf8");
    process.stdin.on("data", (chunk) => {
      buffer += chunk;
    });
    // "end" fires once all data has been consumed.
    process.stdin.on("end", () => resolve(buffer));
    // Without this a read error would leave the promise pending forever.
    process.stdin.on("error", reject);
    process.stdin.resume();
  });
}
// The whole HTML document is read up front; parsing starts only after stdin
// has been fully consumed. `stdin` and `rawHtml` name the same string.
const rawHtml = await readStdin();
const stdin = rawHtml;
/**
 * Tells whether a URL points at an explicit HTTP/HTTPS origin.
 *
 * Matches `http://`, `https://` and protocol-relative `//` prefixes. The
 * scheme is compared case-insensitively, so `HTTP://` and `Https://` are
 * recognised as well.
 *
 * @param {string} url Attribute value to test.
 * @returns {boolean} True when the URL starts with one of those prefixes.
 */
function isHttp(url) {
  const r = /^(https?:\/\/|\/\/)/i;
  return r.test(url);
}
/**
 * Tells whether a URL begins with a slash (a root-relative path).
 *
 * @param {string} url Attribute value to test.
 * @returns {boolean} True when the first character is "/".
 */
function isAbsolute(url) {
  return url.charAt(0) === "/";
}
/**
 * Collects the text found inside an element, used to identify the element in
 * the report.
 *
 * Every text node is trimmed and the pieces are concatenated without a
 * separator; nested "tag" children are visited recursively. Children of any
 * other type contribute nothing.
 *
 * @param {{children: Array<object>}} el Element whose text is collected.
 * @returns {string} The concatenated, trimmed text.
 */
function getHtml(el) {
  // Alternative kept for reference: rawHtml.slice(el.startIndex, el.endIndex)
  return el.children
    .map((node) => {
      if (node.type === "text") return node.data.trim();
      if (node.type === "tag") return getHtml(node);
      return "";
    })
    .join("");
}
/**
 * Reports an element whose URL is either an HTTP/S URL or starts with "/".
 * Any other URL produces no output.
 *
 * @param {string} type Label used in the report line (e.g. "link", "img").
 * @param {object} el Element the URL belongs to; its text is logged.
 * @param {string} url The URL to classify.
 */
function checkUrl(type, el, url) {
  let kind;
  // HTTP/S is tested first, so protocol-relative "//" URLs are never
  // reported as merely absolute.
  if (isHttp(url)) kind = "HTTP/S";
  else if (isAbsolute(url)) kind = "Absolute";
  else return;
  log(`${kind} ${type}:`, getHtml(el));
}
/**
 * Checks one element and then every descendant element.
 *
 * - `<a>`: its `href` is classified, or the link is reported when `href` is
 *   missing or empty.
 * - `<audio>`, `<video>`, `<img>`: same treatment for `src`.
 *
 * @param {object} el Element node with `name`, `attribs` and `children`.
 */
function recursive(el) {
  const { name, attribs } = el;

  if (name === "a") {
    const { href } = attribs;
    if (href) checkUrl("link", el, href);
    else log("Link with no href:", getHtml(el));
  }

  const mediaTags = ["audio", "video", "img"];
  if (mediaTags.includes(name)) {
    const { src } = attribs;
    if (src) checkUrl(name, el, src);
    else log(`${name} with no src:`, getHtml(el));
  }

  // Only "tag" children are descended into.
  el.children
    .filter((child) => child.type === "tag")
    .forEach((child) => recursive(child));
}
// Parse the whole input into a DOM, then check every top-level element.
const handler = new DomHandler(
  (error, dom) => {
    if (error) {
      // Report the failure and signal it through the exit code instead of
      // silently printing nothing.
      console.error("Failed to parse HTML:", error);
      process.exitCode = 1;
      return;
    }
    // Time the traversal; with no label, console.time()/timeEnd() use the
    // "default" timer.
    console.time();
    for (const el of dom) {
      if (el.type === "tag") {
        recursive(el);
      }
    }
    console.timeEnd();
  },
  // Ask the handler to record source offsets (startIndex/endIndex) on nodes.
  { withEndIndices: true, withStartIndices: true }
);
const parser = new Parser(handler);
parser.parseComplete(rawHtml);

21
package.json Normal file
View file

@ -0,0 +1,21 @@
{
"name": "site-analyzer",
"type": "module",
"version": "0.0.1",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"domhandler": "^5.0.3",
"htmlparser2": "^8.0.1"
},
"devDependencies": {
"@types/htmlparser2": "^3.10.3",
"@types/node": "^18.11.9"
}
}

86
pnpm-lock.yaml Normal file
View file

@ -0,0 +1,86 @@
lockfileVersion: 5.4
specifiers:
'@types/htmlparser2': ^3.10.3
'@types/node': ^18.11.9
domhandler: ^5.0.3
htmlparser2: ^8.0.1
dependencies:
domhandler: 5.0.3
htmlparser2: 8.0.1
devDependencies:
'@types/htmlparser2': 3.10.3
'@types/node': 18.11.9
packages:
/@types/domutils/1.7.4:
resolution: {integrity: sha512-w542nRQ0vpXQjLYP52LKqrugQtUq580dEDiDIyZ6IBmV8a3LXjGVNxfj/jUQxS0kDsbZAWsSxQOcTfVX3HRdwg==}
dependencies:
domhandler: 2.4.2
dev: true
/@types/htmlparser2/3.10.3:
resolution: {integrity: sha512-XA74aD+acytofnZic9n83Rxy/IZ259299bYPx5SEyx7uymPi79lRyKDkhJlsuCaPHB7rEoTEhRN4Vm2G5WmHHg==}
dependencies:
'@types/domutils': 1.7.4
'@types/node': 18.11.9
domhandler: 2.4.2
dev: true
/@types/node/18.11.9:
resolution: {integrity: sha512-CRpX21/kGdzjOpFsZSkcrXMGIBWMGNIHXXBVFSH+ggkftxg+XYP20TESbh+zFvFj3EQOl5byk0HTRn1IL6hbqg==}
dev: true
/dom-serializer/2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
entities: 4.4.0
dev: false
/domelementtype/1.3.1:
resolution: {integrity: sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==}
dev: true
/domelementtype/2.3.0:
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
dev: false
/domhandler/2.4.2:
resolution: {integrity: sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==}
dependencies:
domelementtype: 1.3.1
dev: true
/domhandler/5.0.3:
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
engines: {node: '>= 4'}
dependencies:
domelementtype: 2.3.0
dev: false
/domutils/3.0.1:
resolution: {integrity: sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==}
dependencies:
dom-serializer: 2.0.0
domelementtype: 2.3.0
domhandler: 5.0.3
dev: false
/entities/4.4.0:
resolution: {integrity: sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==}
engines: {node: '>=0.12'}
dev: false
/htmlparser2/8.0.1:
resolution: {integrity: sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==}
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.0.1
entities: 4.4.0
dev: false

1
rem

@ -1 +0,0 @@
Subproject commit 53d2307030c9b9ce3501b007de86b49b0838d3ae

View file

@ -1,87 +0,0 @@
const std = @import("std");
const rem = @import("rem");
/// Decodes a UTF-8 byte string into a slice of Unicode codepoints.
///
/// Fails if `string` is not valid UTF-8 or if allocation fails. The caller
/// owns the returned slice and must free it with `allocator`.
fn utf8DecodeString(allocator: std.mem.Allocator, string: []const u8) ![]u21 {
    // Validation happens up front, so iteration below cannot hit bad input.
    const view = try std.unicode.Utf8View.init(string);

    var codepoints = std.ArrayList(u21).init(allocator);
    errdefer codepoints.deinit();

    var it = view.iterator();
    while (it.nextCodepoint()) |cp| try codepoints.append(cp);

    return codepoints.toOwnedSlice();
}
/// Reads an HTML document from stdin, parses it with `rem`, and runs `check`
/// on the document's root element.
///
/// Returns 0 on success, and 1 when the parser reports an error or the
/// document has no root element. I/O, allocation and UTF-8 decoding failures
/// are propagated as errors.
pub fn main() !u8 {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Input is capped at 1 MiB; anything larger makes readToEndAlloc fail.
    const string = try std.io.getStdIn().readToEndAlloc(allocator, 1024 * 1024);
    defer allocator.free(string);

    // The string must be decoded before it can be passed to the parser.
    // const input = &rem.util.utf8DecodeStringComptime(string);
    const input = try utf8DecodeString(allocator, string);
    // Declared before the parser, so it is released only after the parser
    // and DOM (which are deinitialised first, in reverse order) are gone.
    defer allocator.free(input);

    // Create the DOM in which the parsed Document will be created.
    var dom = rem.dom.Dom{ .allocator = allocator };
    defer dom.deinit();

    var parser = try rem.Parser.init(&dom, input, allocator, .abort, false);
    defer parser.deinit();
    try parser.run();

    const errors = parser.errors();
    if (errors.len > 0) {
        std.log.err("A parsing error occured!\n{s}\n", .{@tagName(errors[0])});
        return 1;
    }

    // const writer = std.io.getStdOut().writer();
    const document = parser.getDocument();
    // try rem.util.printDocument(writer, document, &dom, allocator);

    // Exit with an error instead of panicking when there is no root element.
    const root = document.element orelse {
        std.log.err("document has no root element", .{});
        return 1;
    };
    check(root);
    return 0;
}
const startsWith = std.mem.startsWith;
const startsWithIgnoreCase = std.ascii.startsWithIgnoreCase;
/// Reports whether `url` starts with `//`, `http://` or `https://`.
/// The comparison ignores ASCII case.
fn isHttps(url: []const u8) bool {
    const prefixes = [_][]const u8{ "//", "http://", "https://" };
    for (prefixes) |prefix| {
        if (startsWithIgnoreCase(url, prefix)) return true;
    }
    return false;
}
/// Reports whether `url` begins with a slash.
fn isAbsolute(url: []const u8) bool {
    // "/" has no letter case, so a plain byte comparison is equivalent to
    // the case-insensitive prefix test.
    return url.len != 0 and url[0] == '/';
}
/// Logs findings for a single element:
/// - `img` without an `alt` attribute is logged as an error;
/// - `a` whose `href` is an http/s or absolute URL, or that has no `href`,
///   is logged as a warning.
///
/// Only `element` itself is inspected; the traversal of its children is
/// commented out below.
fn check(element: *const rem.dom.Element) void {
    // std.log.info("{any}", .{element.element_type});
    switch (element.element_type) {
        .html_img => if (element.attributes.getEntry("alt") == null) {
            std.log.err("img with no alt text", .{});
        },
        .html_a => link: {
            const entry = element.attributes.getEntry("href") orelse {
                std.log.warn("link with no href", .{});
                break :link;
            };
            const href = entry.value_ptr.*;
            // http/s is tested first, so "//host/..." is never reported as
            // merely absolute.
            if (isHttps(href)) {
                std.log.warn("link to http/s: {s}", .{href});
            } else if (isAbsolute(href)) {
                std.log.warn("absolute link: {s}", .{href});
            }
        },
        else => {},
    }
    // Disabled child traversal, kept for reference:
    // for (element.children.items) |child| {
    //     switch (child) {
    //         .element => |el| check(el),
    //         .cdata => |cdata| std.log.info("cdata: {s}", .{cdata.data.items}),
    //     }
    // }
}