Use Node.js instead
This commit is contained in:
parent
c3f8ed8ee4
commit
ad0ef24257
8 changed files with 197 additions and 133 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,3 +1,3 @@
|
||||||
zig-cache
|
node_modules/
|
||||||
zig-out
|
|
||||||
|
|
||||||
|
*.html
|
||||||
|
|
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -1,3 +0,0 @@
|
||||||
[submodule "rem"]
|
|
||||||
path = rem
|
|
||||||
url = https://github.com/chwayne/rem
|
|
40
build.zig
40
build.zig
|
@ -1,40 +0,0 @@
|
||||||
const std = @import("std");
|
|
||||||
|
|
||||||
/// Build script entry point.
///
/// Declares the `site-analyzer` executable built from `src/main.zig`, a `run`
/// step that executes the installed binary (forwarding any arguments given on
/// the `zig build run -- ...` command line), and a `test` step that runs the
/// unit tests found in `src/main.zig`.
pub fn build(b: *std.build.Builder) void {
    // The HTML parser package, taken from the `rem` directory of this repository.
    const rem_pkg = std.build.Pkg{
        .name = "rem",
        .source = .{ .path = "./rem/rem.zig" },
    };

    // Standard target options allows the person running `zig build` to choose
    // what target to build for. Here we do not override the defaults, which
    // means any target is allowed, and the default is native. Other options
    // for restricting supported target set are available.
    const target = b.standardTargetOptions(.{});

    // Standard release options allow the person running `zig build` to select
    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
    const mode = b.standardReleaseOptions();

    const exe = b.addExecutable("site-analyzer", "src/main.zig");
    exe.setTarget(target);
    exe.setBuildMode(mode);
    exe.addPackage(rem_pkg);
    exe.install();

    const run_cmd = exe.run();
    run_cmd.step.dependOn(b.getInstallStep());
    if (b.args) |args| {
        run_cmd.addArgs(args);
    }

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);

    const exe_tests = b.addTest("src/main.zig");
    exe_tests.setTarget(target);
    exe_tests.setBuildMode(mode);
    // src/main.zig does `@import("rem")`, so the test artifact needs the same
    // package as the executable or the import cannot be resolved.
    exe_tests.addPackage(rem_pkg);

    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&exe_tests.step);
}
|
|
88
index.js
Normal file
88
index.js
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
import { Parser } from "htmlparser2";
|
||||||
|
import { DomHandler } from "domhandler";
|
||||||
|
import { performance } from "perf_hooks";
|
||||||
|
|
||||||
|
// Silent stand-in for `log`: enable the commented-out line below (and drop the
// console binding) to mute the report output.
function noop() {}
// const log = noop;
const log = console.log;
|
||||||
|
|
||||||
|
/**
 * Read standard input to the end and return it as one string.
 *
 * The stream is decoded as UTF-8 by the stream itself (`setEncoding`), so a
 * multi-byte character that straddles two chunks is decoded correctly instead
 * of being converted chunk by chunk.
 *
 * @returns {Promise<string>} Resolves with the full input once the stream
 *   signals `end`; rejects if the stream emits `error`.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    const chunks = [];
    process.stdin.setEncoding("utf8");
    // Attaching a "data" listener switches the stream into flowing mode.
    process.stdin.on("data", (chunk) => chunks.push(chunk));
    // "end" is the signal that all data has been consumed.
    process.stdin.on("end", () => resolve(chunks.join("")));
    process.stdin.on("error", reject);
  });
}
|
||||||
|
const stdin = await readStdin();
|
||||||
|
const rawHtml = stdin;
|
||||||
|
|
||||||
|
/**
 * Tell whether a URL points at an explicit web origin.
 *
 * Matches URLs that start with `http://`, `https://`, or the
 * protocol-relative `//`. The scheme is matched case-insensitively, so
 * `HTTP://example.com` is recognised as well.
 *
 * @param {string} url The attribute value to classify.
 * @returns {boolean} `true` when the URL starts with one of the prefixes above.
 */
function isHttp(url) {
  const r = /^(https?:\/\/|\/\/)/i;
  return r.test(url);
}
|
||||||
|
/**
 * Tell whether a URL begins with a slash.
 *
 * Note that protocol-relative URLs (`//host/...`) also begin with a slash and
 * therefore return `true` here.
 *
 * @param {string} url The attribute value to classify.
 * @returns {boolean} `true` when the first character is `/`.
 */
function isAbsolute(url) {
  const firstChar = url.charAt(0);
  return firstChar === "/";
}
|
||||||
|
|
||||||
|
/**
 * Collect the text content of an element.
 *
 * Despite the name, no markup is returned: the trimmed data of every text
 * child is concatenated, descending into children of type "tag". Children of
 * any other type contribute nothing.
 *
 * @param {{children: Array<object>}} el Node whose children are walked.
 * @returns {string} The concatenated, per-node-trimmed text.
 */
function getHtml(el) {
  // return rawHtml.slice(el.startIndex, el.endIndex);
  return el.children
    .map((node) => {
      switch (node.type) {
        case "text":
          return node.data.trim();
        case "tag":
          return getHtml(node);
        default:
          return "";
      }
    })
    .join("");
}
|
||||||
|
|
||||||
|
/**
 * Report an element whose URL is an HTTP/S URL or starts with a slash.
 *
 * HTTP/S (including protocol-relative) URLs are checked first; URLs matching
 * neither category produce no output.
 *
 * @param {string} type Label used in the report line (e.g. "link", "img").
 * @param {object} el Element the URL came from; its text is logged.
 * @param {string} url The URL to classify.
 */
function checkUrl(type, el, url) {
  if (isHttp(url)) {
    log(`HTTP/S ${type}:`, getHtml(el));
    return;
  }
  if (!isAbsolute(url)) {
    return;
  }
  log(`Absolute ${type}:`, getHtml(el));
}
|
||||||
|
|
||||||
|
/**
 * Walk an element tree and report on links and media elements.
 *
 * `<a>` elements are checked through their `href`; `<audio>`, `<video>` and
 * `<img>` elements through their `src`. A missing (or empty) attribute is
 * reported as such. Afterwards every child of type "tag" is visited the same
 * way.
 *
 * @param {object} el Element to inspect; reads `name`, `attribs`, `children`.
 */
function recursive(el) {
  switch (el.name) {
    case "a":
      if (el.attribs.href) {
        checkUrl("link", el, el.attribs.href);
      } else {
        log("Link with no href:", getHtml(el));
      }
      break;
    case "audio":
    case "video":
    case "img":
      if (el.attribs.src) {
        checkUrl(el.name, el, el.attribs.src);
      } else {
        log(`${el.name} with no src:`, getHtml(el));
      }
      break;
    default:
      break;
  }

  el.children.forEach((node) => {
    if (node.type === "tag") {
      recursive(node);
    }
  });
}
|
||||||
|
|
||||||
|
// DOM handler: once parsing completes, walk every top-level element and time
// the walk with console.time()/console.timeEnd(). A parse error is reported on
// stderr and turns the process exit code non-zero instead of being ignored.
const handler = new DomHandler(
  (error, dom) => {
    if (error) {
      console.error("Failed to parse HTML:", error);
      process.exitCode = 1;
      return;
    }

    console.time();
    for (const el of dom) {
      if (el.type === "tag") {
        recursive(el);
      }
    }
    console.timeEnd();
  },
  { withEndIndices: true, withStartIndices: true }
);

// Feed the whole document to the parser in one go.
const parser = new Parser(handler);
parser.parseComplete(rawHtml);
|
21
package.json
Normal file
21
package.json
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
{
|
||||||
|
"name": "site-analyzer",
|
||||||
|
"type": "module",
|
||||||
|
"version": "0.0.1",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"domhandler": "^5.0.3",
|
||||||
|
"htmlparser2": "^8.0.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/htmlparser2": "^3.10.3",
|
||||||
|
"@types/node": "^18.11.9"
|
||||||
|
}
|
||||||
|
}
|
86
pnpm-lock.yaml
Normal file
86
pnpm-lock.yaml
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
lockfileVersion: 5.4
|
||||||
|
|
||||||
|
specifiers:
|
||||||
|
'@types/htmlparser2': ^3.10.3
|
||||||
|
'@types/node': ^18.11.9
|
||||||
|
domhandler: ^5.0.3
|
||||||
|
htmlparser2: ^8.0.1
|
||||||
|
|
||||||
|
dependencies:
|
||||||
|
domhandler: 5.0.3
|
||||||
|
htmlparser2: 8.0.1
|
||||||
|
|
||||||
|
devDependencies:
|
||||||
|
'@types/htmlparser2': 3.10.3
|
||||||
|
'@types/node': 18.11.9
|
||||||
|
|
||||||
|
packages:
|
||||||
|
|
||||||
|
/@types/domutils/1.7.4:
|
||||||
|
resolution: {integrity: sha512-w542nRQ0vpXQjLYP52LKqrugQtUq580dEDiDIyZ6IBmV8a3LXjGVNxfj/jUQxS0kDsbZAWsSxQOcTfVX3HRdwg==}
|
||||||
|
dependencies:
|
||||||
|
domhandler: 2.4.2
|
||||||
|
dev: true
|
||||||
|
|
||||||
|
/@types/htmlparser2/3.10.3:
|
||||||
|
resolution: {integrity: sha512-XA74aD+acytofnZic9n83Rxy/IZ259299bYPx5SEyx7uymPi79lRyKDkhJlsuCaPHB7rEoTEhRN4Vm2G5WmHHg==}
|
||||||
|
dependencies:
|
||||||
|
'@types/domutils': 1.7.4
|
||||||
|
'@types/node': 18.11.9
|
||||||
|
domhandler: 2.4.2
|
||||||
|
dev: true
|
||||||
|
|
||||||
|
/@types/node/18.11.9:
|
||||||
|
resolution: {integrity: sha512-CRpX21/kGdzjOpFsZSkcrXMGIBWMGNIHXXBVFSH+ggkftxg+XYP20TESbh+zFvFj3EQOl5byk0HTRn1IL6hbqg==}
|
||||||
|
dev: true
|
||||||
|
|
||||||
|
/dom-serializer/2.0.0:
|
||||||
|
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||||
|
dependencies:
|
||||||
|
domelementtype: 2.3.0
|
||||||
|
domhandler: 5.0.3
|
||||||
|
entities: 4.4.0
|
||||||
|
dev: false
|
||||||
|
|
||||||
|
/domelementtype/1.3.1:
|
||||||
|
resolution: {integrity: sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==}
|
||||||
|
dev: true
|
||||||
|
|
||||||
|
/domelementtype/2.3.0:
|
||||||
|
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
|
||||||
|
dev: false
|
||||||
|
|
||||||
|
/domhandler/2.4.2:
|
||||||
|
resolution: {integrity: sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==}
|
||||||
|
dependencies:
|
||||||
|
domelementtype: 1.3.1
|
||||||
|
dev: true
|
||||||
|
|
||||||
|
/domhandler/5.0.3:
|
||||||
|
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
|
||||||
|
engines: {node: '>= 4'}
|
||||||
|
dependencies:
|
||||||
|
domelementtype: 2.3.0
|
||||||
|
dev: false
|
||||||
|
|
||||||
|
/domutils/3.0.1:
|
||||||
|
resolution: {integrity: sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==}
|
||||||
|
dependencies:
|
||||||
|
dom-serializer: 2.0.0
|
||||||
|
domelementtype: 2.3.0
|
||||||
|
domhandler: 5.0.3
|
||||||
|
dev: false
|
||||||
|
|
||||||
|
/entities/4.4.0:
|
||||||
|
resolution: {integrity: sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==}
|
||||||
|
engines: {node: '>=0.12'}
|
||||||
|
dev: false
|
||||||
|
|
||||||
|
/htmlparser2/8.0.1:
|
||||||
|
resolution: {integrity: sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==}
|
||||||
|
dependencies:
|
||||||
|
domelementtype: 2.3.0
|
||||||
|
domhandler: 5.0.3
|
||||||
|
domutils: 3.0.1
|
||||||
|
entities: 4.4.0
|
||||||
|
dev: false
|
1
rem
1
rem
|
@ -1 +0,0 @@
|
||||||
Subproject commit 53d2307030c9b9ce3501b007de86b49b0838d3ae
|
|
87
src/main.zig
87
src/main.zig
|
@ -1,87 +0,0 @@
|
||||||
const std = @import("std");
|
|
||||||
const rem = @import("rem");
|
|
||||||
|
|
||||||
/// Decodes a UTF-8 byte string into a slice of code points.
///
/// Returns an error if `string` is not valid UTF-8 or if allocation fails.
/// Caller owns the returned slice and must free it with `allocator`.
fn utf8DecodeString(allocator: std.mem.Allocator, string: []const u8) ![]u21 {
    // Validate up front; nothing has been allocated yet if this fails.
    const view = try std.unicode.Utf8View.init(string);

    var codepoints = std.ArrayList(u21).init(allocator);
    errdefer codepoints.deinit();

    var it = view.iterator();
    while (true) {
        const codepoint = it.nextCodepoint() orelse break;
        try codepoints.append(codepoint);
    }
    return codepoints.toOwnedSlice();
}
|
|
||||||
|
|
||||||
/// Program entry point: reads an HTML document from stdin (at most 1 MiB),
/// parses it with `rem`, and runs `check` on the document's root element.
///
/// Returns 1 when the parser reports at least one error (the first one is
/// logged), 0 otherwise. I/O, decoding, and allocation failures propagate as
/// errors.
pub fn main() !u8 {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const string = try std.io.getStdIn().readToEndAlloc(allocator, 1024 * 1024);
    // The buffer is heap-allocated for us; release it so the allocator's leak
    // check in `gpa.deinit()` stays clean.
    defer allocator.free(string);

    // The string must be decoded before it can be passed to the parser.
    const input = try utf8DecodeString(allocator, string);
    // Declared before the parser's defer so that (LIFO) the parser is torn
    // down before the code points it was given are freed.
    defer allocator.free(input);

    // Create the DOM in which the parsed Document will be created.
    var dom = rem.dom.Dom{ .allocator = allocator };
    defer dom.deinit();

    var parser = try rem.Parser.init(&dom, input, allocator, .abort, false);
    defer parser.deinit();
    try parser.run();

    const errors = parser.errors();
    if (errors.len > 0) {
        std.log.err("A parsing error occured!\n{s}\n", .{@tagName(errors[0])});
        return 1;
    }

    const document = parser.getDocument();

    // A document without a root element has nothing to check; skip it rather
    // than unwrapping a null optional.
    if (document.element) |root| {
        check(root);
    }
    return 0;
}
|
|
||||||
const startsWith = std.mem.startsWith;
|
|
||||||
const startsWithIgnoreCase = std.ascii.startsWithIgnoreCase;
|
|
||||||
|
|
||||||
/// Reports whether `url` starts with `//`, `http://`, or `https://`.
/// The comparison ignores ASCII case. Despite the name, plain `http://` and
/// protocol-relative URLs match too.
fn isHttps(url: []const u8) bool {
    const prefixes = [_][]const u8{ "//", "http://", "https://" };
    for (prefixes) |prefix| {
        if (std.ascii.startsWithIgnoreCase(url, prefix)) return true;
    }
    return false;
}
|
|
||||||
/// Reports whether `url` begins with a `/`.
/// Protocol-relative URLs (`//host/...`) also begin with a slash and match.
fn isAbsolute(url: []const u8) bool {
    return url.len != 0 and url[0] == '/';
}
|
|
||||||
|
|
||||||
/// Logs findings for a single element.
///
/// - `img`: logs an error when there is no `alt` attribute.
/// - `a`: logs a warning when `href` is missing, starts with `//`, `http://`
///   or `https://`, or otherwise starts with `/`.
///
/// Elements of any other type are ignored. Only the element passed in is
/// inspected; the traversal of its children is commented out below.
fn check(element: *const rem.dom.Element) void {
    // std.log.info("{any}", .{element.element_type});
    const kind = element.element_type;

    if (kind == .html_img) {
        const alt = element.attributes.getEntry("alt");
        if (alt == null) {
            std.log.err("img with no alt text", .{});
        }
    } else if (kind == .html_a) {
        if (element.attributes.getEntry("href")) |href_entry| {
            const href = href_entry.value_ptr.*;
            // The HTTP/S test runs first, so `//host/...` is reported as
            // http/s rather than as an absolute link.
            if (isHttps(href)) {
                std.log.warn("link to http/s: {s}", .{href});
            } else if (isAbsolute(href)) {
                std.log.warn("absolute link: {s}", .{href});
            }
        } else {
            std.log.warn("link with no href", .{});
        }
    }

    // for (element.children.items) |child| {
    //     switch (child) {
    //         .element => |el| check(el),
    //         .cdata => |cdata| std.log.info("cdata: {s}", .{cdata.data.items}),
    //     }
    // }
}
|
|
Loading…
Reference in a new issue