From b58c5ce8caaada52e63f4ceaec26c1972c93d931 Mon Sep 17 00:00:00 2001 From: Nulo Date: Sun, 12 Sep 2021 22:28:40 -0300 Subject: [PATCH] Initial commit --- .gitignore | 3 + .gitmodules | 3 + cli.zig | 17 +++ commonmark-spec | 1 + html.zig | 31 ++++ main.zig | 4 + parser.zig | 374 ++++++++++++++++++++++++++++++++++++++++++++++ readme.md | 29 ++++ readme_en.md | 27 ++++ run-spec-tests.sh | 5 + 10 files changed, 494 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 cli.zig create mode 160000 commonmark-spec create mode 100644 html.zig create mode 100644 main.zig create mode 100644 parser.zig create mode 100644 readme.md create mode 100644 readme_en.md create mode 100755 run-spec-tests.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7305c87 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +cli +zig-cache/ +zig-out/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c3e6a18 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "commonmark-spec"] + path = commonmark-spec + url = https://github.com/commonmark/commonmark-spec diff --git a/cli.zig b/cli.zig new file mode 100644 index 0000000..4e94b1a --- /dev/null +++ b/cli.zig @@ -0,0 +1,17 @@ +const std = @import("std"); +const aaronsw = @import("main.zig"); + +pub fn main() !void { + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = &gpa.allocator; + + const str = try std.io.getStdIn().reader().readAllAlloc(allocator, 10241024); + defer allocator.free(str); + + const writer = std.io.getStdOut().writer(); + + var doc = try aaronsw.parse(allocator, str); + defer doc.deinit(); + try aaronsw.html.printChildren(writer, doc.children); +} diff --git a/commonmark-spec b/commonmark-spec new file mode 160000 index 0000000..499ebba --- /dev/null +++ b/commonmark-spec @@ -0,0 +1 @@ +Subproject commit 499ebbad90163881f51498c4c620652d0c66fb2e diff --git a/html.zig b/html.zig new file mode 
100644 index 0000000..000b2ac --- /dev/null +++ b/html.zig @@ -0,0 +1,31 @@ +const std = @import("std"); +const Children = @import("parser.zig").Children; + +pub fn printChildren(writer: anytype, children: Children) @TypeOf(writer).Error!void { + for (children.items) |child| { + switch (child) { + .document => unreachable, + .heading => |heading| { + try writer.print("{1s}", .{ heading.level, heading.text }); + }, + .paragraph => |paragraph| { + try writer.print("

{s}

", .{paragraph.text}); + }, + .block_quote => |block_quote| { + try writer.print("
", .{}); + try printChildren(writer, block_quote.children); + try writer.print("
", .{}); + }, + .list => |list| { + try writer.print("", .{}); + }, + .list_item => |list_item| { + try writer.print("
  • ", .{}); + try printChildren(writer, list_item.children); + try writer.print("
  • ", .{}); + }, + } + } +} diff --git a/main.zig b/main.zig new file mode 100644 index 0000000..a183b64 --- /dev/null +++ b/main.zig @@ -0,0 +1,4 @@ +pub const html = @import("html.zig"); +const parser = @import("parser.zig"); +pub const parse = parser.parse; +pub const Children = parser.Children; diff --git a/parser.zig b/parser.zig new file mode 100644 index 0000000..517e3d7 --- /dev/null +++ b/parser.zig @@ -0,0 +1,374 @@ +const std = @import("std"); + +fn last(slice: anytype) ?*@typeInfo(@TypeOf(slice)).Pointer.child { + if (slice.len == 0) return null; + return &slice[slice.len - 1]; +} + +pub const Children = std.ArrayList(Block); +fn deinitChildren(children: *Children) void { + for (children.items) |*child| { + switch (child.*) { + .document => unreachable, + .paragraph => |*paragraph| paragraph.deinit(children.allocator), + .heading => |*heading| heading.deinit(children.allocator), + .block_quote => |*block_quote| block_quote.deinit(children.allocator), + .list => |*list| list.deinit(children.allocator), + .list_item => |*list_item| list_item.deinit(children.allocator), + } + } + children.deinit(); +} + +const Block = union(enum) { + document: Document, + paragraph: Paragraph, + heading: Heading, + block_quote: BlockQuote, + list: List, + list_item: ListItem, +}; + +const Document = struct { + children: Children, + pub fn deinit(self: *Document) void { + deinitChildren(&self.children); + } +}; +const Paragraph = struct { + text: []u8, + pub fn deinit(self: *Paragraph, allocator: *std.mem.Allocator) void { + allocator.free(self.text); + } +}; +const Heading = struct { + level: u8, + text: []u8, + pub fn deinit(self: *Heading, allocator: *std.mem.Allocator) void { + allocator.free(self.text); + } +}; +const BlockQuote = struct { + children: Children, + pub fn deinit(self: *BlockQuote, _: *std.mem.Allocator) void { + deinitChildren(&self.children); + } +}; +const List = struct { + pub const Kind = union(enum) { + bullet: u8, + }; + kind: Kind, + tight: bool, 
+ children: Children, + pub fn deinit(self: *List, _: *std.mem.Allocator) void { + deinitChildren(&self.children); + } +}; +const ListItem = struct { + children: Children, + pub fn deinit(self: *ListItem, _: *std.mem.Allocator) void { + deinitChildren(&self.children); + } +}; + +fn arrayListAppend(comptime T: type, list: *std.ArrayList(T), item: T) !*T { + const ptr = try list.addOne(); + ptr.* = item; + return ptr; +} + +fn reallocAppend( + allocator: *std.mem.Allocator, + alloc_str: []u8, + append_str: []const u8, +) ![]u8 { + const ptr = try allocator.realloc(alloc_str, alloc_str.len + append_str.len); + std.mem.copy(u8, ptr[alloc_str.len..], append_str); + return ptr; +} + +fn detectIndent(str: []const u8) usize { + var indent: usize = 0; + for (str) |char| { + if (char == ' ') { + indent += 1; + } else break; + } + return indent; +} + +fn checkIfBlockStillMatches(line: *[]const u8, block: Block) bool { + return switch (block) { + .document => true, + .heading => |_| false, + .paragraph => |_| { + if (line.*.len == 0) return false; + return true; + }, + .block_quote => |_| { + if (std.mem.startsWith(u8, line.*, ">")) { + line.* = line.*[1..]; + const indent = detectIndent(line.*); + line.* = line.*[indent..]; + return true; + } else return false; + }, + .list => |_| { + if (std.mem.startsWith(u8, line.*, "* ")) { + line.* = line.*[2..]; + return true; + } else if (detectIndent(line.*) >= 4) { + return true; + } else return false; + }, + .list_item => |_| { + const indent = detectIndent(line.*); + if (indent >= 4) { + line.* = line.*[4..]; + return true; + } else return false; + }, + }; +} +fn getHeadingLevel(line: []const u8) u8 { + var level: u8 = 0; + for (line) |char| { + if (level == 6) break; + if (char == '#') level += 1 else break; + } + return level; +} +fn checkIfBlockStarts(allocator: *std.mem.Allocator, line: *[]const u8) !?Block { + if (std.mem.startsWith(u8, line.*, ">")) { + line.* = line.*[1..]; + const indent = detectIndent(line.*); + line.* = 
line.*[indent..]; + return Block{ .block_quote = .{ .children = Children.init(allocator) } }; + } else if (std.mem.startsWith(u8, line.*, "* ")) { + line.* = line.*[2..]; + return Block{ .list = .{ + .children = Children.init(allocator), + .kind = .{ .bullet = '*' }, + .tight = false, + } }; + } else if (std.mem.startsWith(u8, line.*, "#")) { + const level = getHeadingLevel(line.*); + if (line.*[level..].len != 0) { + if (line.*[level] != ' ' and line.*[level] != '\t') return null; + line.* = line.*[level + 1 ..]; + } + return Block{ .heading = .{ + .text = try allocator.alloc(u8, 0), + .level = level, + } }; + } + return null; +} + +const StackItem = struct { + block: *Block, + matched: bool, +}; +const Stack = std.ArrayList(StackItem); + +// Verifies that the stack used to parse documents is not broken. +fn verifyStack(stack: Stack) void { + for (stack.items) |item, index| { + if (index == 0) { + std.debug.assert(std.mem.eql(u8, @tagName(item.block.*), "document")); + } else { + const parent = stack.items[index - 1]; + const parent_children = switch (parent.block.*) { + .document => |document| document.children.items, + .paragraph, .heading => { + if (index == stack.items.len - 1) break else unreachable; + }, + .block_quote => |block_quote| block_quote.children.items, + .list => |list| list.children.items, + .list_item => |list_item| list_item.children.items, + }; + std.debug.assert(std.meta.eql(item.block, last(parent_children).?)); + } + } +} + +pub fn parse(allocator: *std.mem.Allocator, str: []const u8) !Document { + var doc_block = Block{ .document = Document{ .children = Children.init(allocator) } }; + errdefer doc_block.document.deinit(); + + var stack = Stack.init(allocator); + defer stack.deinit(); + const doc_stack_item = StackItem{ .block = &doc_block, .matched = true }; + try stack.append(doc_stack_item); + + var iter = std.mem.split(u8, str, "\n"); + + lineLoop: while (iter.next()) |line| { + var rest = line; + verifyStack(stack); + + for 
(stack.items) |*item, i| { + item.matched = checkIfBlockStillMatches(&rest, item.block.*); + if (!item.matched) { + switch (item.block.*) { + .paragraph, .list_item => { + try stack.resize(i); + break; + }, + .block_quote => { + if (rest.len == 0) { + try stack.resize(i); + break; + } + }, + else => {}, + } + } + } + + if (try checkIfBlockStarts(allocator, &rest)) |block| { + for (stack.items) |item, i| { + if (!item.matched or + // If a new block started, finish the paragraph + item.block.* == .paragraph) + { + try stack.resize(i); + break; + } + } + + var last_block = stack.items[stack.items.len - 1].block; + const last_block_children = switch (last_block.*) { + .document => |*document| &document.children, + .heading, .paragraph => unreachable, + .block_quote => |*block_quote| &block_quote.children, + .list => |*list| &list.children, + .list_item => |*list_item| &list_item.children, + }; + const new_block = try arrayListAppend(Block, last_block_children, block); + try stack.append(.{ .block = new_block, .matched = false }); + } + + if (rest.len == 0) continue :lineLoop; + + var index = stack.items.len - 1; + while (true) : (index -= 1) { + switch (stack.items[index].block.*) { + .heading => |*heading| { + heading.text = try reallocAppend(allocator, heading.text, rest); + try stack.resize(index); + break; + }, + .paragraph => |*paragraph| { + if (paragraph.text.len != 0) { + paragraph.text = try reallocAppend(allocator, paragraph.text, " "); + } + paragraph.text = try reallocAppend(allocator, paragraph.text, rest); + break; + }, + .document => |*document| { + const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; + const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; + const paragraph = try arrayListAppend(Block, &document.children, paragraph_block_tmp); + try stack.append(.{ .block = paragraph, .matched = false }); + index = stack.items.len; + }, + .block_quote => |*block_quote| { + const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; + 
const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; + const paragraph = try arrayListAppend(Block, &block_quote.children, paragraph_block_tmp); + try stack.append(.{ .block = paragraph, .matched = false }); + index = stack.items.len; + }, + .list => |*list| { + const item_tmp = .{ .children = Children.init(allocator) }; + const item_block_tmp = .{ .list_item = item_tmp }; + const item = try arrayListAppend(Block, &list.children, item_block_tmp); + try stack.append(.{ .block = item, .matched = false }); + index = stack.items.len; + }, + .list_item => |*list_item| { + const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; + const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; + const paragraph = try arrayListAppend(Block, &list_item.children, paragraph_block_tmp); + try stack.append(.{ .block = paragraph, .matched = false }); + index = stack.items.len; + }, + } + } + } + + // TODO: use std.log when I figure out how to make the messages print when testing + std.debug.print("parser: stack: {any}\n", .{stack.items}); + verifyStack(stack); + + return doc_block.document; +} + +const testing = std.testing; + +test "block quotes" { + const str = + \\Hello + \\ + \\> Block quotes are + \\written like so. + \\> + \\> They can span multiple paragraphs, + \\> if you like. 
+ ; + + var doc = try parse(std.testing.allocator, str); + defer doc.deinit(); + + try testing.expectEqual(@as(usize, 2), doc.children.items.len); + + try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0])); + try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text); + + try testing.expectEqualStrings("block_quote", @tagName(doc.children.items[1])); + try testing.expectEqual(@as(usize, 2), doc.children.items[1].block_quote.children.items.len); + try testing.expectEqualStrings( + "paragraph", + @tagName(doc.children.items[1].block_quote.children.items[0]), + ); + try testing.expectEqualStrings( + "Block quotes are written like so.", + doc.children.items[1].block_quote.children.items[0].paragraph.text, + ); + try testing.expectEqualStrings( + "paragraph", + @tagName(doc.children.items[1].block_quote.children.items[1]), + ); + try testing.expectEqualStrings( + "They can span multiple paragraphs, if you like.", + doc.children.items[1].block_quote.children.items[1].paragraph.text, + ); +} + +test "headings" { + const str = + \\Hello + \\## Hey how + \\Testing + \\# headings + ; + + var doc = try parse(std.testing.allocator, str); + defer doc.deinit(); + + try testing.expectEqual(@as(usize, 4), doc.children.items.len); + + try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0])); + try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text); + + try testing.expectEqualStrings("heading", @tagName(doc.children.items[1])); + try testing.expectEqualStrings("Hey how", doc.children.items[1].heading.text); + + try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[2])); + try testing.expectEqualStrings("Testing", doc.children.items[2].paragraph.text); + + try testing.expectEqualStrings("heading", @tagName(doc.children.items[3])); + try testing.expectEqualStrings("headings", doc.children.items[3].heading.text); +} diff --git a/readme.md b/readme.md new file mode 100644 index 
0000000..cf97108 --- /dev/null +++ b/readme.md @@ -0,0 +1,29 @@ +# AaronSw + +[English](readme_en.md) + +Un parser de Markdown escrito en [Zig](https://ziglang.org/es). + +## Objetivos + +En este orden: + +* Ser útil (para un proyecto personal mío) +* Ser sencillo y extensible +* Seguir la especificación de [CommonMark](https://spec.commonmark.org) +* Ser eficiente en recursos + +## Ejemplo de uso + +Ver [cli.zig](cli.zig) + +## CommonMark + +Actualmente: `114 passed, 538 failed, 0 errored, 0 skipped` + +Para correr los tests: + +```sh +git submodule update --init commonmark-spec/ +./run-spec-tests.sh +``` diff --git a/readme_en.md b/readme_en.md new file mode 100644 index 0000000..692a343 --- /dev/null +++ b/readme_en.md @@ -0,0 +1,27 @@ +[Castellano](readme.md) + +A Markdown parser written in [Zig](https://ziglang.org). + +## Objectives + +In this order: + +* Be useful (for a secret personal project) +* Be simple and extensible +* Follow the [CommonMark specification](https://spec.commonmark.org) +* Use resources efficiently + +## Example usage + +See [cli.zig](cli.zig) + +## CommonMark + +Currently: `114 passed, 538 failed, 0 errored, 0 skipped` + +To run the tests: + +```sh +git submodule update --init commonmark-spec/ +./run-spec-tests.sh +``` diff --git a/run-spec-tests.sh b/run-spec-tests.sh new file mode 100755 index 0000000..7337256 --- /dev/null +++ b/run-spec-tests.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +zig build-exe cli.zig +cd commonmark-spec +python3 test/spec_tests.py --program ../cli