markdown-zig/parser.zig

435 lines
15 KiB
Zig

const std = @import("std");
fn last(slice: anytype) ?*@typeInfo(@TypeOf(slice)).Pointer.child {
if (slice.len == 0) return null;
return &slice[slice.len - 1];
}
pub const Children = std.ArrayList(Block);
fn deinitChildren(children: *Children) void {
for (children.items) |*child| {
switch (child.*) {
.document => unreachable,
.paragraph => |*paragraph| paragraph.deinit(children.allocator),
.heading => |*heading| heading.deinit(children.allocator),
.block_quote => |*block_quote| block_quote.deinit(children.allocator),
.list => |*list| list.deinit(children.allocator),
.list_item => |*list_item| list_item.deinit(children.allocator),
}
}
children.deinit();
}
const Block = union(enum) {
document: Document,
paragraph: Paragraph,
heading: Heading,
block_quote: BlockQuote,
list: List,
list_item: ListItem,
};
const Document = struct {
children: Children,
pub fn deinit(self: *Document) void {
deinitChildren(&self.children);
}
};
const Paragraph = struct {
text: []u8,
pub fn deinit(self: *Paragraph, allocator: std.mem.Allocator) void {
allocator.free(self.text);
}
};
const Heading = struct {
level: u8,
text: []u8,
pub fn deinit(self: *Heading, allocator: std.mem.Allocator) void {
allocator.free(self.text);
}
};
const BlockQuote = struct {
children: Children,
pub fn deinit(self: *BlockQuote, _: std.mem.Allocator) void {
deinitChildren(&self.children);
}
};
const List = struct {
pub const Kind = union(enum) {
bullet: u8,
};
kind: Kind,
tight: bool,
children: Children,
pub fn deinit(self: *List, _: std.mem.Allocator) void {
deinitChildren(&self.children);
}
};
const ListItem = struct {
children: Children,
pub fn deinit(self: *ListItem, _: std.mem.Allocator) void {
deinitChildren(&self.children);
}
};
fn arrayListAppend(comptime T: type, list: *std.ArrayList(T), item: T) !*T {
const ptr = try list.addOne();
ptr.* = item;
return ptr;
}
fn reallocAppend(
allocator: std.mem.Allocator,
alloc_str: []u8,
append_str: []const u8,
) ![]u8 {
const ptr = try allocator.realloc(alloc_str, alloc_str.len + append_str.len);
std.mem.copy(u8, ptr[alloc_str.len..], append_str);
return ptr;
}
fn detectIndent(str: []const u8) usize {
var indent: usize = 0;
for (str) |char| {
if (char == ' ') {
indent += 1;
} else break;
}
return indent;
}
fn checkIfBlockStillMatches(line: *[]const u8, block: Block) bool {
return switch (block) {
.document => true,
.heading => |_| false,
.paragraph => |_| {
if (line.*.len == 0) return false;
return true;
},
.block_quote => |_| {
if (std.mem.startsWith(u8, line.*, ">")) {
line.* = line.*[1..];
const indent = detectIndent(line.*);
line.* = line.*[indent..];
return true;
} else return false;
},
.list => |_| {
if (std.mem.startsWith(u8, line.*, "* ")) {
line.* = line.*[2..];
return true;
} else if (detectIndent(line.*) >= 4) {
return true;
} else return false;
},
.list_item => |_| {
const indent = detectIndent(line.*);
if (indent >= 4) {
line.* = line.*[4..];
return true;
} else return false;
},
};
}
const ATXHeading = struct {
spaces: u2,
level: u3,
closing_sequence: usize,
closing_sequence_spaces: usize,
pub fn content(self: ATXHeading, line: []const u8) []const u8 {
// It's possible it might not have a space after the heading as it can
// be an empty heading.
const has_space: u1 = if (line.len > self.spaces + self.level) 1 else 0;
return line[self.spaces + self.level + has_space .. line.len - self.closing_sequence - self.closing_sequence_spaces];
}
};
fn parseATXHeading(line: []const u8) ?ATXHeading {
var spaces: u2 = 0;
for (line) |char| {
if (char == ' ' and spaces != 3)
spaces += 1
else if (char == '#')
break
else
return null;
}
var level: u3 = 0;
for (line[spaces..]) |char| {
if (char == '#' and level != 6)
level += 1
else if (char == ' ' or char == '\t' or char == '\n')
break
else
return null;
}
if (level == 0) return null;
var heading = ATXHeading{
.level = level,
.spaces = spaces,
.closing_sequence = 0,
.closing_sequence_spaces = 0,
};
closing_sequence_block: {
const rest = heading.content(line);
var closing_sequence: usize = 0;
var closing_sequence_spaces: usize = 0;
if (rest.len == 0) break :closing_sequence_block;
var i: usize = rest.len - 1;
while (i >= 0) : (i -= 1) {
const char = rest[i];
if (char == ' ')
closing_sequence_spaces += 1
else if (char == '#')
break
else
break :closing_sequence_block;
if (i == 0) break;
}
while (i >= 0) : (i -= 1) {
const char = rest[i];
if (char == '#')
closing_sequence += 1
else if (char == ' ' or char == '\t')
break
else
break :closing_sequence_block;
if (i == 0) break;
}
heading.closing_sequence = closing_sequence;
heading.closing_sequence_spaces = closing_sequence_spaces;
}
return heading;
}
fn checkIfBlockStarts(allocator: std.mem.Allocator, line: *[]const u8) !?Block {
if (std.mem.startsWith(u8, line.*, ">")) {
line.* = line.*[1..];
const indent = detectIndent(line.*);
line.* = line.*[indent..];
return Block{ .block_quote = .{ .children = Children.init(allocator) } };
} else if (std.mem.startsWith(u8, line.*, "* ")) {
line.* = line.*[2..];
return Block{ .list = .{
.children = Children.init(allocator),
.kind = .{ .bullet = '*' },
.tight = false,
} };
} else if (parseATXHeading(line.*)) |heading| {
line.* = heading.content(line.*);
return Block{ .heading = .{
.text = try allocator.alloc(u8, 0),
.level = heading.level,
} };
}
return null;
}
const StackItem = struct {
block: *Block,
matched: bool,
};
const Stack = std.ArrayList(StackItem);
// Verifies that the stack used to parse documents is not broken.
fn verifyStack(stack: Stack) void {
for (stack.items) |item, index| {
if (index == 0) {
std.debug.assert(std.mem.eql(u8, @tagName(item.block.*), "document"));
} else {
const parent = stack.items[index - 1];
const parent_children = switch (parent.block.*) {
.document => |document| document.children.items,
.paragraph, .heading => {
if (index == stack.items.len - 1) break else unreachable;
},
.block_quote => |block_quote| block_quote.children.items,
.list => |list| list.children.items,
.list_item => |list_item| list_item.children.items,
};
std.debug.assert(std.meta.eql(item.block, last(parent_children).?));
}
}
}
pub fn parse(allocator: std.mem.Allocator, str: []const u8) !Document {
var doc_block = Block{ .document = Document{ .children = Children.init(allocator) } };
errdefer doc_block.document.deinit();
var stack = Stack.init(allocator);
defer stack.deinit();
const doc_stack_item = StackItem{ .block = &doc_block, .matched = true };
try stack.append(doc_stack_item);
var iter = std.mem.split(u8, str, "\n");
lineLoop: while (iter.next()) |line| {
var rest = line;
verifyStack(stack);
for (stack.items) |*item, i| {
item.matched = checkIfBlockStillMatches(&rest, item.block.*);
if (!item.matched) {
switch (item.block.*) {
.paragraph, .list_item => {
try stack.resize(i);
break;
},
.block_quote => {
if (rest.len == 0) {
try stack.resize(i);
break;
}
},
else => {},
}
}
}
if (try checkIfBlockStarts(allocator, &rest)) |block| {
for (stack.items) |item, i| {
if (!item.matched or
// If a new block started, finish the paragraph
item.block.* == .paragraph)
{
try stack.resize(i);
break;
}
}
var last_block = stack.items[stack.items.len - 1].block;
const last_block_children = switch (last_block.*) {
.document => |*document| &document.children,
.heading, .paragraph => unreachable,
.block_quote => |*block_quote| &block_quote.children,
.list => |*list| &list.children,
.list_item => |*list_item| &list_item.children,
};
const new_block = try arrayListAppend(Block, last_block_children, block);
try stack.append(.{ .block = new_block, .matched = false });
}
if (rest.len == 0) continue :lineLoop;
var index = stack.items.len - 1;
while (true) : (index -= 1) {
switch (stack.items[index].block.*) {
.heading => |*heading| {
heading.text = try reallocAppend(allocator, heading.text, rest);
try stack.resize(index);
break;
},
.paragraph => |*paragraph| {
if (paragraph.text.len != 0) {
paragraph.text = try reallocAppend(allocator, paragraph.text, " ");
}
paragraph.text = try reallocAppend(allocator, paragraph.text, rest);
break;
},
.document => |*document| {
const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) };
const paragraph_block_tmp = .{ .paragraph = paragraph_tmp };
const paragraph = try arrayListAppend(Block, &document.children, paragraph_block_tmp);
try stack.append(.{ .block = paragraph, .matched = false });
index = stack.items.len;
},
.block_quote => |*block_quote| {
const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) };
const paragraph_block_tmp = .{ .paragraph = paragraph_tmp };
const paragraph = try arrayListAppend(Block, &block_quote.children, paragraph_block_tmp);
try stack.append(.{ .block = paragraph, .matched = false });
index = stack.items.len;
},
.list => |*list| {
const item_tmp = .{ .children = Children.init(allocator) };
const item_block_tmp = .{ .list_item = item_tmp };
const item = try arrayListAppend(Block, &list.children, item_block_tmp);
try stack.append(.{ .block = item, .matched = false });
index = stack.items.len;
},
.list_item => |*list_item| {
const paragraph_tmp = .{ .text = try allocator.alloc(u8, 0) };
const paragraph_block_tmp = .{ .paragraph = paragraph_tmp };
const paragraph = try arrayListAppend(Block, &list_item.children, paragraph_block_tmp);
try stack.append(.{ .block = paragraph, .matched = false });
index = stack.items.len;
},
}
}
}
// TODO: use std.log when I figure out how to make the messages print when testing
std.debug.print("parser: stack: {any}\n", .{stack.items});
verifyStack(stack);
return doc_block.document;
}
const testing = std.testing;
test "block quotes" {
const str =
\\Hello
\\
\\> Block quotes are
\\written like so.
\\>
\\> They can span multiple paragraphs,
\\> if you like.
;
var doc = try parse(std.testing.allocator, str);
defer doc.deinit();
try testing.expectEqual(@as(usize, 2), doc.children.items.len);
try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0]));
try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text);
try testing.expectEqualStrings("block_quote", @tagName(doc.children.items[1]));
try testing.expectEqual(@as(usize, 2), doc.children.items[1].block_quote.children.items.len);
try testing.expectEqualStrings(
"paragraph",
@tagName(doc.children.items[1].block_quote.children.items[0]),
);
try testing.expectEqualStrings(
"Block quotes are written like so.",
doc.children.items[1].block_quote.children.items[0].paragraph.text,
);
try testing.expectEqualStrings(
"paragraph",
@tagName(doc.children.items[1].block_quote.children.items[1]),
);
try testing.expectEqualStrings(
"They can span multiple paragraphs, if you like.",
doc.children.items[1].block_quote.children.items[1].paragraph.text,
);
}
test "headings" {
const str =
\\Hello
\\## Hey how
\\Testing
\\# headings
;
var doc = try parse(std.testing.allocator, str);
defer doc.deinit();
try testing.expectEqual(@as(usize, 4), doc.children.items.len);
try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[0]));
try testing.expectEqualStrings("Hello", doc.children.items[0].paragraph.text);
try testing.expectEqualStrings("heading", @tagName(doc.children.items[1]));
try testing.expectEqualStrings("Hey how", doc.children.items[1].heading.text);
try testing.expectEqualStrings("paragraph", @tagName(doc.children.items[2]));
try testing.expectEqualStrings("Testing", doc.children.items[2].paragraph.text);
try testing.expectEqualStrings("heading", @tagName(doc.children.items[3]));
try testing.expectEqualStrings("headings", doc.children.items[3].heading.text);
}