const std = @import("std"); fn last(slice: anytype) ?*@typeInfo(@TypeOf(slice)).Pointer.child { if (slice.len == 0) return null; return &slice[slice.len - 1]; } pub const Children = std.ArrayList(Block); fn deinitChildren(children: *Children) void { for (children.items) |*child| { switch (child.*) { .document => unreachable, .paragraph => |*paragraph| paragraph.deinit(children.allocator), .heading => |*heading| heading.deinit(children.allocator), .block_quote => |*block_quote| block_quote.deinit(children.allocator), .list => |*list| list.deinit(children.allocator), .list_item => |*list_item| list_item.deinit(children.allocator), } } children.deinit(); } const Block = union(enum) { document: Document, paragraph: Paragraph, heading: Heading, block_quote: BlockQuote, list: List, list_item: ListItem, }; const Document = struct { children: Children, pub fn deinit(self: *Document) void { deinitChildren(&self.children); } }; const Paragraph = struct { text: []u8, pub fn deinit(self: *Paragraph, allocator: std.mem.Allocator) void { allocator.free(self.text); } }; const Heading = struct { level: u8, text: []u8, pub fn deinit(self: *Heading, allocator: std.mem.Allocator) void { allocator.free(self.text); } }; const BlockQuote = struct { children: Children, pub fn deinit(self: *BlockQuote, _: std.mem.Allocator) void { deinitChildren(&self.children); } }; const List = struct { pub const Kind = union(enum) { bullet: u8, }; kind: Kind, tight: bool, children: Children, pub fn deinit(self: *List, _: std.mem.Allocator) void { deinitChildren(&self.children); } }; const ListItem = struct { children: Children, pub fn deinit(self: *ListItem, _: std.mem.Allocator) void { deinitChildren(&self.children); } }; fn arrayListAppend(comptime T: type, list: *std.ArrayList(T), item: T) !*T { const ptr = try list.addOne(); ptr.* = item; return ptr; } fn reallocAppend( allocator: std.mem.Allocator, alloc_str: []u8, append_str: []const u8, ) ![]u8 { const ptr = try allocator.realloc(alloc_str, alloc_str.len + append_str.len); std.mem.copy(u8, ptr[alloc_str.len..], append_str); return ptr; } fn detectIndent(str: []const u8) usize { var indent: usize = 0; for (str) |char| { if (char == ' ') { indent += 1; } else break; } return indent; } fn checkIfBlockStillMatches(line: *[]const u8, block: Block) bool { return switch (block) { .document => true, .heading => |_| false, .paragraph => |_| { if (line.*.len == 0) return false; return true; }, .block_quote => |_| { if (std.mem.startsWith(u8, line.*, ">")) { line.* = line.*[1..]; const indent = detectIndent(line.*); line.* = line.*[indent..]; return true; } else return false; }, .list => |_| { if (std.mem.startsWith(u8, line.*, "* ")) { line.* = line.*[2..]; return true; } else if (detectIndent(line.*) >= 4) { return true; } else return false; }, .list_item => |_| { const indent = detectIndent(line.*); if (indent >= 4) { line.* = line.*[4..]; return true; } else return false; }, }; } const ATXHeading = struct { spaces: u2, level: u3, closing_sequence: usize, closing_sequence_spaces: usize, pub fn content(self: ATXHeading, line: []const u8) []const u8 { // It's possible it might not have a space after the heading as it can // be an empty heading. const has_space: u1 = if (line.len > self.spaces + self.level) 1 else 0; return line[self.spaces + self.level + has_space .. line.len - self.closing_sequence - self.closing_sequence_spaces]; } }; fn parseATXHeading(line: []const u8) ?ATXHeading { var spaces: u2 = 0; for (line) |char| { if (char == ' ' and spaces != 3) spaces += 1 else if (char == '#') break else return null; } var level: u3 = 0; for (line[spaces..]) |char| { if (char == '#' and level != 6) level += 1 else if (char == ' ' or char == '\t' or char == '\n') break else return null; } if (level == 0) return null; var heading = ATXHeading{ .level = level, .spaces = spaces, .closing_sequence = 0, .closing_sequence_spaces = 0, }; closing_sequence_block: { const rest = heading.content(line); var closing_sequence: usize = 0; var closing_sequence_spaces: usize = 0; if (rest.len == 0) break :closing_sequence_block; var i: usize = rest.len - 1; while (i >= 0) : (i -= 1) { const char = rest[i]; if (char == ' ') closing_sequence_spaces += 1 else if (char == '#') break else break :closing_sequence_block; if (i == 0) break; } while (i >= 0) : (i -= 1) { const char = rest[i]; if (char == '#') closing_sequence += 1 else if (char == ' ' or char == '\t') break else break :closing_sequence_block; if (i == 0) break; } heading.closing_sequence = closing_sequence; heading.closing_sequence_spaces = closing_sequence_spaces; } return heading; } fn checkIfBlockStarts(allocator: std.mem.Allocator, line: *[]const u8) !?Block { if (std.mem.startsWith(u8, line.*, ">")) { line.* = line.*[1..]; const indent = detectIndent(line.*); line.* = line.*[indent..]; return Block{ .block_quote = .{ .children = Children.init(allocator) } }; } else if (std.mem.startsWith(u8, line.*, "* ")) { line.* = line.*[2..]; return Block{ .list = .{ .children = Children.init(allocator), .kind = .{ .bullet = '*' }, .tight = false, } }; } else if (parseATXHeading(line.*)) |heading| { line.* = heading.content(line.*); return Block{ .heading = .{ .text = try allocator.alloc(u8, 0), .level = heading.level, } }; } return null; } const StackItem = struct { block: *Block, matched: bool, }; const Stack = std.ArrayList(StackItem); // Verifies that the stack used to parse documents is not broken. fn verifyStack(stack: Stack) void { for (stack.items) |item, index| { if (index == 0) { std.debug.assert(std.mem.eql(u8, @tagName(item.block.*), "document")); } else { const parent = stack.items[index - 1]; const parent_children = switch (parent.block.*) { .document => |document| document.children.items, .paragraph, .heading => { if (index == stack.items.len - 1) break else unreachable; }, .block_quote => |block_quote| block_quote.children.items, .list => |list| list.children.items, .list_item => |list_item| list_item.children.items, }; std.debug.assert(std.meta.eql(item.block, last(parent_children).?)); } } } pub fn parse(allocator: std.mem.Allocator, str: []const u8) !Document { var doc_block = Block{ .document = Document{ .children = Children.init(allocator) } }; errdefer doc_block.document.deinit(); var stack = Stack.init(allocator); defer stack.deinit(); const doc_stack_item = StackItem{ .block = &doc_block, .matched = true }; try stack.append(doc_stack_item); var iter = std.mem.split(u8, str, "\n"); lineLoop: while (iter.next()) |line| { var rest = line; verifyStack(stack); for (stack.items) |*item, i| { item.matched = checkIfBlockStillMatches(&rest, item.block.*); if (!item.matched) { switch (item.block.*) { .paragraph, .list_item => { try stack.resize(i); break; }, .block_quote => { if (rest.len == 0) { try stack.resize(i); break; } }, else => {}, } } } if (try checkIfBlockStarts(allocator, &rest)) |block| { for (stack.items) |item, i| { if (!item.matched or // If a new block started, finish the paragraph item.block.* == .paragraph) { try stack.resize(i); break; } } var last_block = stack.items[stack.items.len - 1].block; const last_block_children = switch (last_block.*) { .document => |*document| &document.children, .heading, .paragraph => unreachable, .block_quote => |*block_quote| &block_quote.children, .list => |*list| &list.children, .list_item => |*list_item| &list_item.children, }; const new_block = try arrayListAppend(Block, last_block_children, block); try stack.append(.{ .block = new_block, .matched = false }); } if (rest.len == 0) continue :lineLoop; var index = stack.items.len - 1; while (true) : (index -= 1) { switch (stack.items[index].block.*) { .heading => |*heading| { heading.text = try reallocAppend(allocator, heading.text, rest); try stack.resize(index); break; }, .paragraph => |*paragraph| { if (paragraph.text.len != 0) { paragraph.text = try reallocAppend(allocator, paragraph.text, " "); } paragraph.text = try reallocAppend(allocator, paragraph.text, rest); break; }, .document => |*document| { const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; const paragraph = try arrayListAppend(Block, &document.children, paragraph_block_tmp); try stack.append(.{ .block = paragraph, .matched = false }); index = stack.items.len; }, .block_quote => |*block_quote| { const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; const paragraph = try arrayListAppend(Block, &block_quote.children, paragraph_block_tmp); try stack.append(.{ .block = paragraph, .matched = false }); index = stack.items.len; }, .list => |*list| { const item_tmp = .{ .children = Children.init(allocator) }; const item_block_tmp = .{ .list_item = item_tmp }; const item = try arrayListAppend(Block, &list.children, item_block_tmp); try stack.append(.{ .block = item, .matched = false }); index = stack.items.len; }, .list_item => |*list_item| { const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) }; const paragraph_block_tmp = .{ .paragraph = paragraph_tmp }; const paragraph = try arrayListAppend(Block, &list_item.children, paragraph_block_tmp); try stack.append(.{ .block = paragraph, .matched = false }); index = stack.items.len; }, } } } // TODO: use std.log when I figure out how to make the messages print when testing std.debug.print("parser: stack: {any}\n", .{stack.items}); verifyStack(stack); return doc_block.document; } const testing = std.testing; test "block quotes" { const str = \\Hello \\ \\> Block quotes are \\written like so. \\> \\> They can span multiple paragraphs, \\> if you like. ; var doc = try parse(std.testing.allocator, str); defer doc.deinit(); try testing.expectEqual(@as(usize, 2), doc.children.items.len); try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0])); try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text); try testing.expectEqualStrings("block_quote", @tagName(doc.children.items[1])); try testing.expectEqual(@as(usize, 2), doc.children.items[1].block_quote.children.items.len); try testing.expectEqualStrings( "paragraph", @tagName(doc.children.items[1].block_quote.children.items[0]), ); try testing.expectEqualStrings( "Block quotes are written like so.", doc.children.items[1].block_quote.children.items[0].paragraph.text, ); try testing.expectEqualStrings( "paragraph", @tagName(doc.children.items[1].block_quote.children.items[1]), ); try testing.expectEqualStrings( "They can span multiple paragraphs, if you like.", doc.children.items[1].block_quote.children.items[1].paragraph.text, ); } test "headings" { const str = \\Hello \\## Hey how \\Testing \\# headings ; var doc = try parse(std.testing.allocator, str); defer doc.deinit(); try testing.expectEqual(@as(usize, 4), doc.children.items.len); try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0])); try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text); try testing.expectEqualStrings("heading", @tagName(doc.children.items[1])); try testing.expectEqualStrings("Hey how", doc.children.items[1].heading.text); try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[2])); try testing.expectEqualStrings("Testing", doc.children.items[2].paragraph.text); try testing.expectEqualStrings("heading", @tagName(doc.children.items[3])); try testing.expectEqualStrings("headings", doc.children.items[3].heading.text); }