// comb/parse_code_blocks.odin

package comb

import "core:strings"
import tt "core:testing"

// Code_Block describes one fenced code block found by `parse_code_blocks`.
//
// The string fields and the keys/values stored in `tags` are slices of the string that was
// parsed, not copies, so they are only valid for as long as that string is.
Code_Block :: struct {
	// 0-based index of the line holding the opening fence.
	line_start: int,
	// 0-based index of the line holding the closing fence.
	line_end:   int,
	// Byte offset in the parsed string at which `text` begins (just past the opening fence line).
	char_start: int,
	// Byte offset in the parsed string at which `text` ends (exclusive).
	char_end:   int,
	// Language name: the text following the opening backticks, up to the first space.
	language:   string,
	// Key/value pairs read from the `{key=value, ...}` section after the language.
	// This map is the only allocation owned by the block; free it with `destroy_code_block`.
	tags:       map[string]string,
	// The block's content between the two fence lines.
	text:       string,
}

// destroy_code_block frees the memory owned by `cb`.
//
// Only the `tags` map is deleted; the string fields are left untouched because they are
// slices of the parsed input rather than separate allocations.
destroy_code_block :: proc(cb: Code_Block) {
	delete(cb.tags)
}

// ParseError lists the ways `parse_code_blocks` can fail.
ParseError :: enum {
	// Parsing succeeded.
	None = 0,
	// The input ended while a code block was still open (no closing fence was found).
	Incomplete_Code_Block,
	// The text after the language on an opening fence is not a well-formed `{ ... }` tag list.
	Invalid_Tag,
}

// parse_code_blocks extracts the fenced code blocks (``` ... ```) found in `s`.
//
// Only blocks whose opening fence names a language are extracted; a bare ``` opening fence is
// skipped. Anything after the language must be a tag list of the form `{key=value, key=value}`.
// Pairs without an `=` are ignored.
//
// Inputs:
// - s: the text to scan. Both "\n" and "\r\n" line endings are supported.
//
// Returns:
// - blocks: the blocks found, in order of appearance. Their strings are slices of `s`.
//   The caller must call `destroy_code_block` on every element and `delete` the array.
//   Blocks parsed before an error occurred are still returned.
// - err: `.Invalid_Tag` if the text after a language is not a well-formed `{ ... }` tag list,
//   `.Incomplete_Code_Block` if the input ends inside an open block, `.None` otherwise.
parse_code_blocks :: proc(s: string) -> (blocks: [dynamic]Code_Block, err: ParseError) {
	it := s
	in_code_block := false
	current_block := Code_Block{}
	line_index := 0
	// Byte offset in `s` of the line currently being processed.
	char_index := 0
	for line in strings.split_lines_iterator(&it) {
		// The iterator has already consumed this line together with its terminator, so `it`
		// now starts at the next line. Deriving the offset from what is left (instead of
		// assuming a one-byte terminator) keeps the offsets correct for "\r\n" input too.
		next_char_index := len(s) - len(it)
		defer line_index += 1
		defer char_index = next_char_index

		trimmed_line := strings.trim_space(line)
		strings.starts_with(trimmed_line, "```") or_continue
		in_code_block = !in_code_block
		// We just finished parsing a code block
		if !in_code_block {
			current_block.line_end = line_index
			// Drop the "\n" that precedes the closing fence. An empty block has no such
			// newline of its own, so never let the end fall before the start.
			current_block.char_end = max(char_index - 1, current_block.char_start)
			// Also drop the "\r" of a "\r\n" terminator.
			if current_block.char_end > current_block.char_start &&
			   s[current_block.char_end - 1] == '\r' {
				current_block.char_end -= 1
			}
			current_block.text = s[current_block.char_start:current_block.char_end]
			append(&blocks, current_block)
			current_block = {}
			continue
		}

		// It looks like we're starting a code block, but there's no language specified
		// That means we shouldn't try to extract it
		if len(trimmed_line) == 3 {
			in_code_block = false
			continue
		}

		current_block.line_start = line_index
		// The block's text starts on the line right after the opening fence.
		current_block.char_start = next_char_index
		remaining := strings.trim_space(trimmed_line[3:])
		first_space := strings.index_rune(remaining, ' ')
		// There are no keys after the language name
		if first_space == -1 {
			current_block.language = remaining
			continue
		}

		current_block.language = remaining[:first_space]
		// The +1 is safe because we know there are characters after the space (or else it would have been trimmed)
		tag_str := remaining[first_space + 1:]
		opening_curly := strings.index_rune(tag_str, '{')
		closing_curly := strings.last_index_byte(tag_str, '}')

		// Validate before anything else so that a stray `}` without a `{` is rejected
		// instead of being mistaken for an empty `{}`.
		// TODO: allow curlies if they're in strings
		if opening_curly == -1 || closing_curly == -1 || closing_curly < opening_curly {
			// TODO: improve error reporting
			return blocks, .Invalid_Tag
		}
		// `{}` and `{   }` both end up empty here: a block without tags.
		tag_content_str := strings.trim_space(tag_str[opening_curly + 1:closing_curly])
		if tag_content_str == "" do continue

		tag_content_str_it := tag_content_str
		for pair in strings.split_iterator(&tag_content_str_it, ",") {
			trimmed_pair := strings.trim_space(pair)
			// Split on the first '='; everything after it is the value.
			equals_index := strings.index_byte(trimmed_pair, '=')
			if equals_index == -1 do continue
			current_block.tags[trimmed_pair[:equals_index]] = trimmed_pair[equals_index + 1:]
		}
	}

	// TODO: improve error reporting
	if in_code_block {
		// The unterminated block is never handed to the caller, so its tag map has to be
		// released here or it would leak.
		delete(current_block.tags)
		return blocks, .Incomplete_Code_Block
	}

	return blocks, .None
}


// Parses `example_1` and checks that three blocks are found, that the first one is a python
// block with the expected text, and that the tags of the second and third blocks are read.
@(test)
parse_blocks_correctly :: proc(t: ^tt.T) {
	blocks, err := parse_code_blocks(example_1)
	// Deferred statements run in reverse order: the blocks are destroyed first, then the array.
	defer delete(blocks)
	defer for cb in blocks do destroy_code_block(cb)
	tt.expect(t, err == nil, "parse_blocks_correctly should not error")
	tt.expect_value(t, len(blocks), 3)
	{
		b := blocks[0]
		tt.expect_value(t, b.language, "python")
		tt.expect_value(t, b.text, `print("Hello world")`)
	}

	{
		b := blocks[1]
		tt.expect_value(t, len(b.tags), 1)
		val :=
			b.tags["Foo"] or_else tt.fail_now(t, "the second code block should have a `Foo` key")
		tt.expect_value(t, val, "bar")
	}

	{
		b := blocks[2]
		tt.expect_value(t, len(b.tags), 2)
		val :=
			b.tags["Foo"] or_else tt.fail_now(t, "the third code block should have a `Foo` key")
		tt.expect_value(t, val, "bar")
		val =
			b.tags["Bazz"] or_else tt.fail_now(t, "the third code block should have a `Bazz` key")
		tt.expect_value(t, val, "quq")
	}
}