Use Node.js instead
This commit is contained in:
parent
c3f8ed8ee4
commit
ad0ef24257
8 changed files with 197 additions and 133 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,3 +1,3 @@
|
|||
zig-cache
|
||||
zig-out
|
||||
node_modules/
|
||||
|
||||
*.html
|
||||
|
|
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -1,3 +0,0 @@
|
|||
[submodule "rem"]
|
||||
path = rem
|
||||
url = https://github.com/chwayne/rem
|
40
build.zig
40
build.zig
|
@ -1,40 +0,0 @@
|
|||
const std = @import("std");
|
||||
|
||||
/// Declares the build graph for the `site-analyzer` executable: an install
/// step, a `run` step that forwards extra command-line arguments, and a
/// `test` step that runs the tests found in `src/main.zig`.
pub fn build(b: *std.build.Builder) void {
    // The HTML parser lives in the `rem` directory and is exposed to the
    // sources under the import name "rem".
    const rem_pkg = std.build.Pkg{
        .name = "rem",
        .source = .{ .path = "./rem/rem.zig" },
    };

    // Standard target options allow the person running `zig build` to choose
    // what target to build for. Here we do not override the defaults, which
    // means any target is allowed, and the default is native. Other options
    // for restricting the supported target set are available.
    const target = b.standardTargetOptions(.{});

    // Standard release options allow the person running `zig build` to select
    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
    const mode = b.standardReleaseOptions();

    const exe = b.addExecutable("site-analyzer", "src/main.zig");
    exe.setTarget(target);
    exe.setBuildMode(mode);
    exe.addPackage(rem_pkg);
    exe.install();

    const run_cmd = exe.run();
    run_cmd.step.dependOn(b.getInstallStep());
    if (b.args) |args| {
        run_cmd.addArgs(args);
    }

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);

    const exe_tests = b.addTest("src/main.zig");
    exe_tests.setTarget(target);
    exe_tests.setBuildMode(mode);
    // src/main.zig imports "rem", so the test build needs the package too;
    // without it the `test` step cannot resolve that import.
    exe_tests.addPackage(rem_pkg);

    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&exe_tests.step);
}
|
88
index.js
Normal file
88
index.js
Normal file
|
@ -0,0 +1,88 @@
|
|||
import { Parser } from "htmlparser2";
|
||||
import { DomHandler } from "domhandler";
|
||||
import { performance } from "perf_hooks";
|
||||
|
||||
const noop = () => {};
|
||||
// const log = noop;
|
||||
const { log } = console;
|
||||
|
||||
/**
 * Reads everything from standard input and resolves with it as one string.
 *
 * @returns {Promise<string>} The complete stdin contents decoded as UTF-8.
 *   The promise rejects if the stream emits an "error" event.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    const chunks = [];
    // Let the stream do the decoding: converting each chunk separately can
    // corrupt a multi-byte character that straddles a chunk boundary.
    process.stdin.setEncoding("utf8");
    process.stdin.on("data", (chunk) => chunks.push(chunk));
    // "end" signals that all input has been consumed.
    process.stdin.once("end", () => resolve(chunks.join("")));
    // Without this the promise would stay pending forever on a read failure.
    process.stdin.once("error", reject);
    process.stdin.resume();
  });
}
|
||||
const stdin = await readStdin();
|
||||
const rawHtml = stdin;
|
||||
|
||||
/**
 * Reports whether `url` begins with "http://", "https://", or the
 * protocol-relative prefix "//".
 *
 * @param {string} url The attribute value to inspect.
 * @returns {boolean} True when the URL starts with one of those prefixes.
 */
function isHttp(url) {
  // URI schemes are case-insensitive, so "HTTP://…" has to match as well.
  const prefix = /^(?:https?:)?\/\//i;
  return prefix.test(url);
}
|
||||
// Reports whether `url` starts with "/".
// Note that a protocol-relative URL ("//host/...") also satisfies this test;
// checkUrl consults isHttp first, so such URLs are classified there.
function isAbsolute(url) {
|
||||
return url.startsWith("/");
|
||||
}
|
||||
|
||||
// Collects the text found inside `el` and returns it as a single string.
// Despite the name, no markup is returned: each "text" child contributes its
// trimmed data, each "tag" child contributes the result of a recursive call,
// and every other child type is ignored. The pieces are concatenated without
// any separator, and an element without such children yields "".
function getHtml(el) {
|
||||
// Earlier approach, kept for reference: slice the raw markup by source index.
// return rawHtml.slice(el.startIndex, el.endIndex);
|
||||
let text = "";
|
||||
for (const child of el.children) {
|
||||
if (child.type === "text") text += child.data.trim();
|
||||
else if (child.type === "tag") text += getHtml(child);
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
// Classifies `url` and logs a finding for the element that carries it.
//   type - label used in the log message (for example "link" or a tag name)
//   el   - the element whose text (via getHtml) is logged with the finding
//   url  - the attribute value to classify
// isHttp is tested first, then isAbsolute; a URL matching neither is not logged.
function checkUrl(type, el, url) {
|
||||
if (isHttp(url)) {
|
||||
log(`HTTP/S ${type}:`, getHtml(el));
|
||||
} else if (isAbsolute(url)) {
|
||||
log(`Absolute ${type}:`, getHtml(el));
|
||||
}
|
||||
}
|
||||
|
||||
// Inspects `el` and then every descendant element, logging URL findings.
//   - <a>: its href is passed to checkUrl with the label "link".
//   - <audio>, <video>, <img>: its src is passed to checkUrl with the tag name.
// The attribute checks are truthiness tests, so an attribute that is present
// but empty (href="" or src="") is reported the same way as a missing one.
// Only children whose type is "tag" are visited.
function recursive(el) {
|
||||
if (el.name === "a") {
|
||||
if (el.attribs.href) {
|
||||
checkUrl("link", el, el.attribs.href);
|
||||
} else {
|
||||
log("Link with no href:", getHtml(el));
|
||||
}
|
||||
}
|
||||
if (["audio", "video", "img"].includes(el.name)) {
|
||||
if (el.attribs.src) {
|
||||
checkUrl(el.name, el, el.attribs.src);
|
||||
} else {
|
||||
log(`${el.name} with no src:`, getHtml(el));
|
||||
}
|
||||
}
|
||||
|
||||
// Descend into child elements; non-"tag" nodes are skipped.
for (const child of el.children) {
|
||||
if (child.type === "tag") {
|
||||
recursive(child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DOM handler whose callback receives the finished DOM (or a parse error).
// On success every top-level element is analyzed with `recursive`, wrapped in
// console.time()/console.timeEnd() so the duration of the walk is printed.
const handler = new DomHandler(
  (error, dom) => {
    if (error) {
      // Report the failure instead of silently producing an empty analysis,
      // and make the process exit with a non-zero status.
      console.error("Failed to parse HTML:", error);
      process.exitCode = 1;
      return;
    }

    console.time();
    for (const el of dom) {
      if (el.type === "tag") {
        recursive(el);
      }
    }
    console.timeEnd();
  },
  { withEndIndices: true, withStartIndices: true }
);
|
||||
const parser = new Parser(handler);
|
||||
parser.parseComplete(rawHtml);
|
21
package.json
Normal file
21
package.json
Normal file
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"name": "site-analyzer",
|
||||
"type": "module",
|
||||
"version": "0.0.1",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"domhandler": "^5.0.3",
|
||||
"htmlparser2": "^8.0.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/htmlparser2": "^3.10.3",
|
||||
"@types/node": "^18.11.9"
|
||||
}
|
||||
}
|
86
pnpm-lock.yaml
Normal file
86
pnpm-lock.yaml
Normal file
|
@ -0,0 +1,86 @@
|
|||
lockfileVersion: 5.4
|
||||
|
||||
specifiers:
|
||||
'@types/htmlparser2': ^3.10.3
|
||||
'@types/node': ^18.11.9
|
||||
domhandler: ^5.0.3
|
||||
htmlparser2: ^8.0.1
|
||||
|
||||
dependencies:
|
||||
domhandler: 5.0.3
|
||||
htmlparser2: 8.0.1
|
||||
|
||||
devDependencies:
|
||||
'@types/htmlparser2': 3.10.3
|
||||
'@types/node': 18.11.9
|
||||
|
||||
packages:
|
||||
|
||||
/@types/domutils/1.7.4:
|
||||
resolution: {integrity: sha512-w542nRQ0vpXQjLYP52LKqrugQtUq580dEDiDIyZ6IBmV8a3LXjGVNxfj/jUQxS0kDsbZAWsSxQOcTfVX3HRdwg==}
|
||||
dependencies:
|
||||
domhandler: 2.4.2
|
||||
dev: true
|
||||
|
||||
/@types/htmlparser2/3.10.3:
|
||||
resolution: {integrity: sha512-XA74aD+acytofnZic9n83Rxy/IZ259299bYPx5SEyx7uymPi79lRyKDkhJlsuCaPHB7rEoTEhRN4Vm2G5WmHHg==}
|
||||
dependencies:
|
||||
'@types/domutils': 1.7.4
|
||||
'@types/node': 18.11.9
|
||||
domhandler: 2.4.2
|
||||
dev: true
|
||||
|
||||
/@types/node/18.11.9:
|
||||
resolution: {integrity: sha512-CRpX21/kGdzjOpFsZSkcrXMGIBWMGNIHXXBVFSH+ggkftxg+XYP20TESbh+zFvFj3EQOl5byk0HTRn1IL6hbqg==}
|
||||
dev: true
|
||||
|
||||
/dom-serializer/2.0.0:
|
||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
entities: 4.4.0
|
||||
dev: false
|
||||
|
||||
/domelementtype/1.3.1:
|
||||
resolution: {integrity: sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==}
|
||||
dev: true
|
||||
|
||||
/domelementtype/2.3.0:
|
||||
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
|
||||
dev: false
|
||||
|
||||
/domhandler/2.4.2:
|
||||
resolution: {integrity: sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==}
|
||||
dependencies:
|
||||
domelementtype: 1.3.1
|
||||
dev: true
|
||||
|
||||
/domhandler/5.0.3:
|
||||
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
|
||||
engines: {node: '>= 4'}
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
dev: false
|
||||
|
||||
/domutils/3.0.1:
|
||||
resolution: {integrity: sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==}
|
||||
dependencies:
|
||||
dom-serializer: 2.0.0
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
dev: false
|
||||
|
||||
/entities/4.4.0:
|
||||
resolution: {integrity: sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==}
|
||||
engines: {node: '>=0.12'}
|
||||
dev: false
|
||||
|
||||
/htmlparser2/8.0.1:
|
||||
resolution: {integrity: sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==}
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.0.1
|
||||
entities: 4.4.0
|
||||
dev: false
|
1
rem
1
rem
|
@ -1 +0,0 @@
|
|||
Subproject commit 53d2307030c9b9ce3501b007de86b49b0838d3ae
|
87
src/main.zig
87
src/main.zig
|
@ -1,87 +0,0 @@
|
|||
const std = @import("std");
|
||||
const rem = @import("rem");
|
||||
|
||||
/// Decodes the UTF-8 bytes of `string` into a slice of code points.
///
/// The result is built in an `ArrayList` backed by `allocator` and handed back
/// through `toOwnedSlice`, so the caller owns the returned slice. Fails if
/// `Utf8View.init` rejects `string` or if appending to the list fails; on
/// failure the partially built list is released.
fn utf8DecodeString(allocator: std.mem.Allocator, string: []const u8) ![]u21 {
|
||||
var list = std.ArrayList(u21).init(allocator);
|
||||
// Release the list only when an error is returned below.
errdefer list.deinit();
|
||||
|
||||
var decoded_it = (try std.unicode.Utf8View.init(string)).iterator();
|
||||
while (decoded_it.nextCodepoint()) |codepoint| {
|
||||
try list.append(codepoint);
|
||||
}
|
||||
return list.toOwnedSlice();
|
||||
}
|
||||
|
||||
/// Reads an HTML document from stdin, parses it with `rem`, and runs `check`
/// on the document's root element.
///
/// Returns 0 on success and 1 when the parser reports at least one error.
/// Failures while reading, decoding, or parsing are propagated as errors.
pub fn main() !u8 {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // The read is capped at 1 MiB (the limit passed to readToEndAlloc).
    const string = try std.io.getStdIn().readToEndAlloc(allocator, 1024 * 1024);
    // Both buffers below are owned by this function and must be released;
    // they were previously never freed.
    defer allocator.free(string);

    // The string must be decoded before it can be passed to the parser.
    const input = try utf8DecodeString(allocator, string);
    // Deferred before the DOM and parser are created, so it runs after both
    // of them have been torn down.
    defer allocator.free(input);

    // Create the DOM in which the parsed Document will be created.
    var dom = rem.dom.Dom{ .allocator = allocator };
    defer dom.deinit();

    var parser = try rem.Parser.init(&dom, input, allocator, .abort, false);
    defer parser.deinit();
    try parser.run();

    const errors = parser.errors();
    if (errors.len > 0) {
        std.log.err("A parsing error occurred!\n{s}\n", .{@tagName(errors[0])});
        return 1;
    }

    const document = parser.getDocument();
    // NOTE(review): `.?` assumes a successful parse always yields a root
    // element — confirm against rem's Document type.
    check(document.element.?);
    return 0;
}
|
||||
const startsWith = std.mem.startsWith;
|
||||
const startsWithIgnoreCase = std.ascii.startsWithIgnoreCase;
|
||||
|
||||
/// Returns true when `url` starts with "//", "http://", or "https://",
/// compared without regard to ASCII case. Despite the name, plain "http://"
/// and protocol-relative URLs match as well.
fn isHttps(url: []const u8) bool {
|
||||
return startsWithIgnoreCase(url, "//") or
|
||||
startsWithIgnoreCase(url, "http://") or
|
||||
startsWithIgnoreCase(url, "https://");
|
||||
}
|
||||
/// Returns true when `url` starts with "/". A protocol-relative URL ("//...")
/// also satisfies this; `check` tests `isHttps` first.
fn isAbsolute(url: []const u8) bool {
|
||||
return startsWithIgnoreCase(url, "/");
|
||||
}
|
||||
|
||||
/// Logs accessibility and link findings for `element`.
///
/// - `img` without an "alt" attribute: logged as an error.
/// - `a` with an "href": warns when the URL matches `isHttps`, otherwise when
///   it matches `isAbsolute`; an `a` without "href" is warned about too.
/// - Any other element type produces no output.
///
/// Only `element` itself is inspected: the loop over its children is commented
/// out below, so descendants are not visited.
fn check(element: *const rem.dom.Element) void {
|
||||
// std.log.info("{any}", .{element.element_type});
|
||||
switch (element.element_type) {
|
||||
.html_img => {
|
||||
if (element.attributes.getEntry("alt") == null) {
|
||||
std.log.err("img with no alt text", .{});
|
||||
}
|
||||
},
|
||||
.html_a => {
|
||||
if (element.attributes.getEntry("href")) |entry| {
|
||||
const href = entry.value_ptr.*;
|
||||
if (isHttps(href)) {
|
||||
std.log.warn("link to http/s: {s}", .{href});
|
||||
} else if (isAbsolute(href)) {
|
||||
std.log.warn("absolute link: {s}", .{href});
|
||||
}
|
||||
} else {
|
||||
std.log.warn("link with no href", .{});
|
||||
}
|
||||
},
|
||||
|
||||
else => {},
|
||||
}
|
||||
// Disabled recursion into child nodes; kept for reference.
// for (element.children.items) |child| {
|
||||
// switch (child) {
|
||||
// .element => |el| check(el),
|
||||
// .cdata => |cdata| std.log.info("cdata: {s}", .{cdata.data.items}),
|
||||
// }
|
||||
// }
|
||||
}
|
Loading…
Reference in a new issue