//
// === Parser for Code & Data ===
//
// Zisp s-expressions are defined in terms of an extremely minimal set of data
// types; only that which is necessary to build representations of more complex
// expressions and data types:
//
//   +---------+-------------+---------------+--------+---------+--------+
//   | TYPE    | Bare String | Quoted String | Rune   | Pair    | Nil    |
//   +---------+-------------+---------------+--------+---------+--------+
//   | EXAMPLE | foobar      | "foo bar"     | #name  | (X . Y) | ()     |
//   +---------+-------------+---------------+--------+---------+--------+
//
// Bare strings and quoted strings are polymorphic sub-types of the generic
// string type.  This lets the compiler distinguish between identifiers and
// string literals.
//
// There are also non-negative integers, but they are only used for datum
// labels; regular number literals are handled by the "decoder" (see below).
//
// The parser recognizes various "syntax sugar" and transforms it into uses of
// the above data types.  The most ubiquitous example is of course the list:
//
//   (datum1 datum2 ...)  ->  (datum1 . (datum2 . (... . ())))
//
// The following table summarizes the other supported transformations:
//
//   #datum  -> (#HASH . datum)        #rune(...)  -> (#rune ...)
//
//   [...]   -> (#SQUARE ...)          dat1dat2    -> (#JOIN dat1 . dat2)
//
//   {...}   -> (#BRACE ...)           dat1.dat2   -> (#DOT dat1 . dat2)
//
//   'datum  -> (#QUOTE . datum)       dat1:dat2   -> (#COLON dat1 . dat2)
//
//   `datum  -> (#GRAVE . datum)       dat1|dat2   -> (#PIPE dat1 . dat2)
//
//   ,datum  -> (#COMMA . datum)       #%n=datum   -> (#LABEL hex . datum)
//
//   #%n     -> (#LABEL . hex)
//
// A separate process called "decoding" can transform these objects into other
// data types.  For example, (#HASH x y z) could become a vector, so that the
// expression #(x y z) works just like in Scheme.
//
// Decoding also resolves datum labels, and goes over bare strings to find ones
// that are actually a number literal.  This lets us offload the complexity of
// number parsing elsewhere, so the parser remains extremely simple.
//
// Further notes about the syntax sugar table and examples above:
//
// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
//   means zero or more data; hex is a hexadecimal number of up to 12 digits.
//
// * The #datum form only applies to expressions that cannot be mistaken for a
//   rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a
//   bare string will parse as a rune instead (or raise an error if it contains
//   characters that are illegal in rune names).
//
// * A backslash causes the immediately following character to lose any special
//   meaning it would normally have, and be considered as part of a bare string
//   instead.  (This does not apply to whitespace.)  For example, the following
//   three character sequences are each a valid bare string:
//
//     foo\(bar\)  \]blah  \#\'xyz
//
//   This can be used to follow a hash with a bare string without it being
//   parsed as a rune:
//
//     #\foo  -> (#HASH . foo)
//
// * Though not represented in the table due to notational difficulty, the
//   format "#rune(...)" doesn't require a list in the second position; any
//   datum works, so long as there's no ambiguity; for example:
//
//     #rune1#rune2  -> (#rune1 . #rune2)
//
//     #rune"text"   -> (#rune . "text")
//
//     #rune'string  -> (#rune #QUOTE . string)
//
//   As a counter-example, following a rune immediately with a bare string is
//   not possible, since it's ambiguous:
//
//     #abcdefgh  ;Could be (#abcdef . gh) or (#abcde . fgh) or ...
//
//   The parser will see this as an attempt to use an 8-letter rune name, and
//   raise an error, since rune names are limited to 6 characters.
//
// * Syntax sugar can combine arbitrarily; some examples follow:
//
//     #{...}            -> (#HASH #BRACE ...)
//
//     #'foo             -> (#HASH #QUOTE . foo)
//
//     ##'[...]          -> (#HASH #HASH #QUOTE #SQUARE ...)
//
//     {x y}[i j]        -> (#JOIN (#BRACE x y) #SQUARE i j)
//
//     foo.bar.baz{x y}  -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
//
// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in
//   Zisp; the operand of #QUOTE is the entire cdr.  The same principle is used
//   when parsing other sugar; some examples follow:
//
//            Incorrect                              Correct
//
//     #(x y z) -> (#HASH (x y z))            #(x y z) -> (#HASH x y z)
//
//     [x y z]  -> (#SQUARE (x y z))          [x y z]  -> (#SQUARE x y z)
//
//     #{x}     -> (#HASH (#BRACE (x)))       #{x}     -> (#HASH #BRACE x)
//
//     foo(x y) -> (#JOIN foo (x y))          foo(x y) -> (#JOIN foo x y)
//
// * Runes are case-sensitive, and the parser only emits runes using upper-case
//   letters when expressing syntax sugar.  This way, there can be no accidental
//   clash with runes that appear verbatim in code, as long as only lower-case
//   letters are used for rune literals in code.
//
//
// === Decoding ===
//
// A separate process called "decoding" can transform simple data structures,
// consisting of only the above types, into a richer set of Zisp data types.
//
// For example, the decoder may turn (#HASH ...) into a vector, as one would
// expect a vector literal like #(...) to work in Scheme.
//
// Runes may be decoded in isolation as well, rather than transforming a list
// whose head they appear in.  This can implement #true and #false.
//
// The decoder may also perform arbitrary transforms on any type; for example,
// it may turn bare strings into numbers wher appropriate.  This can implement
// number literals.
//
// The decoder recognizes (#QUOTE ...) to implement the traditional quoting
// mechanism, but with a significant difference:
//
// Traditional quote is "unhygienic" in Scheme terms.  An expression such as
// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
// of lexical context it appears in, so the semantics will depend on whatever
// the identifier "quote" is bound to in that lexical context, meaning that the
// expression may end up evaluating to something other than the list (foo bar).
//
// The Zisp decoder, which transforms not text to text, but objects to objects,
// can turn (#QUOTE ...) into an abstract object which encapsulates the notion
// of quoting, which the Zisp evaluator can recognize and act upon, ensuring
// that an expression like '(foo bar) always turns into the list (foo bar).
//
// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that
// expressions like '(foo bar) turn directly into a *syntax object* when read,
// rather than a regular list object.
//
// The decoder is, of course, configurable and extensible.  The transformations
// mentioned above would be performed only when it's told to decode data which
// represents Zisp code.  The decoder may be given a different configuration,
// telling it to decode, for example, data which represents a different kind of
// domain-specific data, such as application settings, build system commands,
// complex data records with non-standard data types, and so on.
//
//
// === Trampolining strategy ===
//
// Instead of using recursion directly, the parser is written in a "trampoline"
// style, which ensures that parsing depth is not limited by the stack size.
//
// All state and context is passed around via a struct pointer.  The parser has
// a main loop, which calls a function as dictated by state.context.next, and
// the function may update the state to have another function called next.
//
// If a function wants to call the parser recursively, it pushes some of the
// current context onto a stack, including what function the recursive parser
// should "return" to, and then updates the state to instruct the main loop to
// call one of the starting functions.
//
// If a function wants to make the parser return, either from a recursive parse
// or from the main loop, it sets the .result field, and tries to pop the saved
// context.  If the context stack was empty, the main loop returns.
//
// While it's possible to just set .next and return, to make the main loop call
// another function next (possibly even setting .result to pass a value to it),
// this is completely unnecessary.  A few non-recursive calls won't blow the
// stack.  It's only recursive parsing that we use the above strategy for.
//
//
// === .start_parse VS .start_datum ===
//
// The difference between .start_parse and .start_datum is that the former will
// allow whitespace and comments at the beginning, whereas the latter expects a
// datum immediately without anything else in front of it.
//
// When calling the parser recursively, it may seem logical to always make it
// start with .start_datum, because we already cleared whitespace and comments
// out of the way.  However, in some cases, we must use .start_parse instead.
//
// This is because of datum comments.  When one appears, we start a recursive
// parser, but instead of making it return to a function that will consume the
// result, we make it return to the original starting point, so the result is
// ignored and the parser retries what it was originally doing.  If we always
// used .start_datum, this would never allow whitespace after a datum comment,
// since we would be back at .start_datum after the comment is out of the way:
//
//   (foo #;bar baz)  ;must use .start_parse at the start of each element
//
//
// === List parsing strategy ===
//
// When it comes to pairs and lists, we basically treat everything as a list,
// and a pair is just seen as the shortest possible improper list.  This saves
// memory: If we implemented list parsing as pair parsing, we would be calling
// the parser recursively, deeper and deeper, for every list element.  Though
// we're not limited by stack space thanks to the strategy described above, it
// would still waste memory and the time it takes to allocate memory.
//
//
// === Buffering ===
//
// We use a small circular buffer just so we can backtrack.  We read bytes into
// this buffer one by one, because we don't want to consume more bytes from the
// input than what we actually parse.  Consider the following input stream:
//
//   (a b c) (x y z)
//
// If we used proper buffering, like reading up to 4K bytes per read, then the
// whole stream would be consumed at once before it's parsed.  Then, the parser
// would return the first datum, and the rest of the stream would be lost.  The
// parser would need some way to reset the input stream's read head to the end
// of the first datum, but not all stream types may support this.
//
// For efficiency, call the parser on an input stream with implicit buffering.
//
//
// === Notation used in comments ===
//
// Some comments throughout the file give you an example of where the parser
// currently might be in a stream.  These illustrations use the pipe symbol,
// which looks like a cursor, to indicate the current position:
//
//   (foo| bar baz)   <- parser arrived at the end of the string foo
//
//   (foo bar baz)|   <- parser reached EOF (assuming no trailing spaces)
//

const builtin = @import("builtin");
const std = @import("std");

const lib = @import("../lib.zig");
const value = @import("../value.zig");

const List = std.ArrayListUnmanaged;

const Value = value.Value;

const cons = value.pair.cons;

const is_test = builtin.is_test;
const is_debug = builtin.mode == .Debug;

const detailed_debug = false;

// In debug, we want to see if we leak, so very small numbers.
const init_stack_capacity = if (is_debug) 32 else 32;
const init_chars_capacity = if (is_debug) 512 else 512;

// zig fmt: off
const DOT    = value.rune.pack("DOT");
const COLON  = value.rune.pack("COLON");
const PIPE   = value.rune.pack("PIPE");
const JOIN   = value.rune.pack("JOIN");
const LABEL  = value.rune.pack("LABEL");
const HASH   = value.rune.pack("HASH");
const QUOTE  = value.rune.pack("QUOTE");
const GRAVE  = value.rune.pack("GRAVE");
const COMMA  = value.rune.pack("COMMA");
const SQUARE = value.rune.pack("SQUARE");
const BRACE  = value.rune.pack("BRACE");
// zig fmt: on

const S_DOT = value.sstr.pack(".");

const Context = struct {
    // What to do next.
    next: Fn = .parse_unit,
    // For storing a context value, like accumulated list elements.
    val: Value = undefined,
    // For storing a context char, like list opening bracket.
    char: u8 = undefined,
};

const ParseError = error{
    InvalidCharacter,
    UnclosedString,
    UnexpectedEof,
    UnicodeError,
    RuneTooLong,
    OutOfMemory,
    OutOfRange,
    ReadError,
};

const State = struct {
    input: std.io.AnyReader,
    stack_alloc: std.mem.Allocator,
    chars_alloc: std.mem.Allocator,

    context: Context = .{},
    stack: List(Context) = undefined,

    chars: List(u8) = undefined,

    // TODO: Implement bool flag for when skipping a unit

    result: Value = undefined,
    unused_char: ?u8 = null,

    err_code: anyerror = undefined,
    err_msg: []const u8 = undefined,

    fn init(
        input: std.io.AnyReader,
        stack_alloc: std.mem.Allocator,
        chars_alloc: std.mem.Allocator,
    ) !State {
        var s: State = .{
            .input = input,
            .stack_alloc = stack_alloc,
            .chars_alloc = chars_alloc,
        };
        s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity);
        s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity);
        return s;
    }

    fn deinit(s: *State) void {
        s.stack.deinit(s.stack_alloc);
        s.chars.deinit(s.chars_alloc);
    }

    fn err(s: *State, e: ParseError, msg: []const u8) ParseError {
        s.err_msg = msg;
        return e;
    }

    fn read(s: *State) !?u8 {
        if (is_debug and s.unused_char != null) {
            @panic("Called read() while there was an unused character!");
        }
        const c = s.input.readByte() catch |e| switch (e) {
            error.EndOfStream => return null,
            else => {
                s.err_code = e;
                return error.ReadError;
            },
        };
        if (detailed_debug) {
            std.debug.print("{c}", .{c});
        }
        return c;
    }

    fn readNoEof(s: *State, emsg: []const u8) !u8 {
        return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg);
    }

    fn getUnused(s: *State) ?u8 {
        if (s.unused_char) |c| {
            s.unused_char = null;
            return c;
        }
        return null;
    }

    fn skipLine(s: *State) !void {
        while (try s.read()) |c| if (c == '\n') break;
    }

    fn addChar(s: *State, c: u8) !void {
        try s.chars.append(s.chars_alloc, c);
    }

    fn getBareString(s: *State) Value {
        defer s.chars.clearRetainingCapacity();
        return if (s.chars.items.len <= 6)
            value.sstr.pack(s.chars.items)
        else
            value.istr.intern(s.chars.items, false);
    }

    fn getQuotedString(s: *State) Value {
        defer s.chars.clearRetainingCapacity();
        return if (s.chars.items.len <= 6)
            value.sstr.packQuoted(s.chars.items)
        else
            value.istr.intern(s.chars.items, true);
    }

    fn getRune(s: *State) Value {
        defer s.chars.clearRetainingCapacity();
        return value.rune.pack(s.chars.items);
    }

    fn push(s: *State, next: Fn) !void {
        try s.stack.append(s.stack_alloc, .{ .next = next });
    }

    fn pushContext(s: *State, next: Fn) !void {
        try s.stack.append(s.stack_alloc, .{
            .next = next,
            .val = s.context.val,
            .char = s.context.char,
        });
    }

    fn subr(s: *State, start: Fn, next: Fn) !void {
        try s.pushContext(next);
        s.context.next = start;
    }

    fn jump(s: *State, next: Fn, val: ?Value) void {
        if (val) |v| s.result = v;
        s.context.next = next;
    }

    fn abort(s: *State, next: Fn, unused_c: u8) void {
        s.result = value.undef;
        s.unused_char = unused_c;
        s.context.next = next;
    }

    fn retval(s: *State, val: Value) void {
        s.result = val;
        if (s.stack.pop()) |c| {
            s.context = c;
        } else {
            s.context.next = .done;
        }
    }
};

pub fn parse(input: std.io.AnyReader) Value {
    var debug_alloc: std.heap.DebugAllocator(.{}) = undefined;
    if (!is_test and is_debug) {
        debug_alloc = .init;
    }
    defer if (!is_test and is_debug) {
        if (debug_alloc.deinit() == .leak) {
            @panic("leak");
        }
    };

    const alloc = if (is_test)
        std.testing.allocator
    else if (is_debug)
        debug_alloc.allocator()
    else
        std.heap.smp_allocator;

    const stack_mem = init_stack_capacity * @sizeOf(Context);
    const chars_mem = init_chars_capacity;
    var stack_sfa = std.heap.stackFallback(stack_mem, alloc);
    var chars_sfa = std.heap.stackFallback(chars_mem, alloc);
    const stack_alloc = stack_sfa.get();
    const chars_alloc = chars_sfa.get();
    var s = State.init(input, stack_alloc, chars_alloc) catch @panic("");
    defer s.deinit();

    while (s.context.next != .done) callNext(&s) catch {
        if (s.unused_char) |c| {
            std.debug.panic(
                "Parse error: {} at: {s}, char: {c}\n",
                .{ s.err_code, s.err_msg, c },
            );
        } else {
            std.debug.panic(
                "Parse error: {} at: {s}\n",
                .{ s.err_code, s.err_msg },
            );
        }
    };
    if (s.unused_char) |c| {
        std.debug.panic("Invalid character: {c}\n", .{c});
    }
    return s.result;
}

const Fn = enum {
    parse_unit,
    return_context,
    end_one_datum,
    parse_join_datum,
    end_join_datum,
    join_data,
    parse_hash_datum,
    end_hash_datum,
    parse_rune_datum,
    end_rune_datum,
    end_label_datum,
    parse_list_element,
    continue_list,
    end_improper_list,
    close_improper_list,
    end_quote_expr,
    done,
};

fn callNext(s: *State) !void {
    if (detailed_debug) {
        const stack = s.stack.items;
        std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{
            stack.len,
            s.context.next,
            s.context.char,
            s.unused_char orelse '_',
        });
        if (stack.len > 0) {
            var i = stack.len;
            while (i > 0) : (i -= 1) {
                const prev = stack[i - 1];
                std.debug.print("{}:{} ctx:'{c}'\n", .{
                    i - 1,
                    prev.next,
                    prev.char,
                });
            }
        }
    }
    try switch (s.context.next) {
        .parse_unit => parseUnit(s),
        .return_context => returnContext(s),
        .end_one_datum => endOneDatum(s),
        .parse_join_datum => parseJoinDatum(s),
        .end_join_datum => endJoinDatum(s),
        .join_data => joinData(s),
        .parse_hash_datum => parseHashDatum(s),
        .end_hash_datum => endHashDatum(s),
        .parse_rune_datum => parseRuneDatum(s),
        .end_rune_datum => endRuneDatum(s),
        .end_label_datum => endLabelDatum(s),
        .parse_list_element => parseListElement(s),
        .continue_list => continueList(s),
        .end_improper_list => endImproperList(s),
        .close_improper_list => closeImproperList(s),
        .end_quote_expr => endQuoteExpr(s),
        .done => unreachable,
    };
}

fn parseUnit(s: *State) !void {
    var c1 = s.getUnused() orelse try s.read();
    while (c1) |c| : (c1 = try s.read()) {
        switch (try checkBlank(s, c)) {
            .yes => {},
            .skip_unit => try s.push(.parse_unit),
            .skip_line => try s.skipLine(),
            .no => return parseDatum(s, c),
        }
    }
    return s.retval(value.eof.eof);
}

fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } {
    return switch (c) {
        '\t'...'\r', ' ' => .yes,
        ';' => switch (try s.read() orelse '\n') {
            '\n' => .yes,
            '~' => .skip_unit,
            else => .skip_line,
        },
        else => .no,
    };
}

fn parseDatum(s: *State, c: u8) !void {
    return parseOneDatum(s, c, .end_one_datum);
}

fn endOneDatum(s: *State) !void {
    if (s.result.eq(value.undef)) {
        return s.retval(value.undef);
    }
    const d = s.result;
    const c1 = s.getUnused() orelse try s.read();
    if (c1) |c| {
        switch (try checkBlank(s, c)) {
            .yes => {},
            .skip_unit => return skipUnitAndReturn(s, d),
            .skip_line => try s.skipLine(),
            .no => return parseJoin(s, d, c),
        }
    }
    return s.retval(d);
}

fn skipUnitAndReturn(s: *State, d: Value) !void {
    s.context.val = d;
    return s.subr(.parse_unit, .return_context);
}

fn returnContext(s: *State) !void {
    return s.retval(s.context.val);
}

fn parseJoin(s: *State, d: Value, c: u8) !void {
    s.context.val = d;
    s.context.char = c;
    switch (c) {
        '.', ':', '|' => {
            s.context.char = c;
            s.unused_char = try s.readNoEof("join datum");
        },
        else => {
            s.context.char = 0;
            s.unused_char = c;
        },
    }
    return s.subr(.parse_join_datum, .join_data);
}

fn parseJoinDatum(s: *State) !void {
    return parseOneDatum(s, s.getUnused().?, .end_join_datum);
}

fn endJoinDatum(s: *State) !void {
    return s.retval(s.result);
}

fn joinData(s: *State) !void {
    const head = s.context.val;
    const join = s.context.char;
    const tail = s.result;
    if (tail.eq(value.undef)) {
        if (join == 0) {
            return s.retval(head);
        } else {
            return s.err(error.InvalidCharacter, "join datum");
        }
    }
    const rune = switch (join) {
        0 => JOIN,
        '.' => DOT,
        ':' => COLON,
        '|' => PIPE,
        else => unreachable,
    };
    const data = cons(head, tail);
    return s.jump(.end_one_datum, cons(rune, data));
}

fn parseOneDatum(s: *State, c: u8, next: Fn) !void {
    if (isBareChar(c)) {
        const d, s.unused_char = try parseBareString(s, c);
        return s.jump(next, d);
    }
    return parseCladDatum(s, c, next);
}

fn parseCladDatum(s: *State, c: u8, next: Fn) !void {
    if (c == '\\') {
        const bs, s.unused_char = try parseBareEscString(s);
        return s.jump(next, bs);
    }
    if (c == '"') {
        const qs = try parseQuotedString(s);
        return s.jump(next, qs);
    }
    return switch (c) {
        '#' => parseHashExpression(s, next),
        '(', '[', '{' => parseList(s, c, next),
        '\'', '`', ',' => parseQuoteExpr(s, c, next),
        else => s.abort(next, c),
    };
}

fn isBareChar(c: u8) bool {
    return switch (c) {
        // zig fmt: off
        'a'...'z' , 'A'...'Z' , '0'...'9',
        '!' , '$' , '%' , '&' , '*' , '+',
        '-' , '/' , '<' , '=' , '>' , '?',
        '@' , '^' , '_' , '~' , '.' => true,
        // zig fmt: on
        else => false,
    };
}

fn isBareEsc(c: u8) bool {
    return switch (c) {
        33...126 => true,
        else => false,
    };
}

fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } {
    try s.addChar(c);
    return parseBareStringRest(s);
}

fn parseBareEscString(s: *State) !struct { Value, ?u8 } {
    try s.addChar(try parseBareEsc(s));
    return parseBareStringRest(s);
}

fn parseBareStringRest(s: *State) !struct { Value, ?u8 } {
    while (try s.read()) |c| {
        if (isBareChar(c)) {
            try s.addChar(c);
        } else if (c == '\\') {
            try s.addChar(try parseBareEsc(s));
        } else {
            return .{ s.getBareString(), c };
        }
    }
    return .{ s.getBareString(), null };
}

fn parseBareEsc(s: *State) !u8 {
    const c = try s.readNoEof("bare escape");
    if (isBareEsc(c)) {
        return c;
    } else {
        return s.err(error.InvalidCharacter, "bare escape");
    }
}

fn parseQuotedString(s: *State) !Value {
    while (try s.read()) |c| {
        if (c == '"') {
            return s.getQuotedString();
        }
        if (c != '\\') {
            try s.addChar(c);
        } else {
            try parseQuotedEsc(s);
        }
    }
    return error.UnclosedString;
}

fn parseQuotedEsc(s: *State) !void {
    const c = try s.readNoEof("quoted escape");
    if (c == 'u') {
        return parseUniHexHandleErrors(s);
    }
    try s.addChar(switch (c) {
        '\\', '"' => c,
        '0' => 0,
        'a' => 7,
        'b' => 8,
        't' => 9,
        'n' => 10,
        'v' => 11,
        'f' => 12,
        'r' => 13,
        'e' => 27,
        'x' => try parseHexByte(s, "hex escape"),
        else => return s.err(error.InvalidCharacter, "quoted escape"),
    });
}

fn parseUniHexHandleErrors(s: *State) !void {
    return parseUniHex(s) catch |err| switch (err) {
        error.Utf8CannotEncodeSurrogateHalf => e: {
            s.err_code = err;
            s.err_msg = "unicode escape";
            break :e error.UnicodeError;
        },
        else => |e| e,
    };
}

fn parseUniHex(s: *State) !void {
    const msg = "unicode escape";

    if (try s.readNoEof(msg) != '{') {
        return s.err(error.InvalidCharacter, msg);
    }

    const uc, const unused_c = try parseHex(s, u21, msg);
    if (unused_c) |c| {
        if (c != '}') {
            return s.err(error.InvalidCharacter, msg);
        }
    } else {
        return s.err(error.UnexpectedEof, msg);
    }

    const n = try std.unicode.utf8CodepointSequenceLength(uc);
    const buf = try s.chars.addManyAsSlice(s.chars_alloc, n);
    _ = try std.unicode.utf8Encode(uc, buf);
}

fn parseHashExpression(s: *State, next: Fn) !void {
    const c = try s.readNoEof("hash expression");
    if (try checkBlank(s, c) != .no) {
        return s.err(error.InvalidCharacter, "hash expression");
    }
    if (std.ascii.isAlphabetic(c)) {
        const r, const unused_c = try parseRune(s, c);
        return parseRuneEnd(s, r, unused_c, next);
    }
    if (c == '%') {
        const l, const unused_c = try parseLabel(s);
        return parseLabelEnd(s, l, unused_c, next);
    }
    if (isBareChar(c)) {
        // Reserved for future extensions to syntax sugar.
        return s.err(error.InvalidCharacter, "hash expression");
    }
    // fast-path to avoid subr
    if (c == '\\') {
        const bs, s.unused_char = try parseBareEscString(s);
        return s.jump(next, cons(HASH, bs));
    }
    if (c == '"') {
        const qs = try parseQuotedString(s);
        return s.jump(next, cons(HASH, qs));
    }
    s.unused_char = c;
    return s.subr(.parse_hash_datum, next);
}

fn parseHashDatum(s: *State) !void {
    return parseCladDatum(s, s.getUnused().?, .end_hash_datum);
}

fn endHashDatum(s: *State) !void {
    if (s.result.eq(value.undef)) {
        return s.err(error.InvalidCharacter, "hash datum");
    }
    return s.retval(cons(HASH, s.result));
}

fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } {
    try s.addChar(c1);
    var len: usize = 1;
    while (try s.read()) |c| : (len += 1) {
        if (len == 6 or !std.ascii.isAlphanumeric(c)) {
            return .{ s.getRune(), c };
        }
        try s.addChar(c);
    }
    return .{ s.getRune(), null };
}

fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void {
    const c = c1 orelse return s.jump(next, r);
    if (c == '\\') {
        const bs, s.unused_char = try parseBareString(s, c);
        return s.jump(next, cons(r, bs));
    }
    if (c == '"') {
        const qs = try parseQuotedString(s);
        return s.jump(next, cons(r, qs));
    }
    s.unused_char = c;
    switch (c) {
        '#', '(', '[', '{', '\'', '`', ',' => {
            try s.push(next);
            return s.jump(.parse_rune_datum, r);
        },
        else => return s.jump(next, r),
    }
}

fn parseRuneDatum(s: *State) !void {
    s.context.val = s.result;
    return parseCladDatum(s, s.getUnused().?, .end_rune_datum);
}

fn endRuneDatum(s: *State) !void {
    const r = s.context.val;
    const d = s.result;
    if (d.eq(value.undef)) {
        s.retval(r);
    }
    return s.retval(cons(r, d));
}

fn parseLabel(s: *State) !struct { Value, ?u8 } {
    const label, const unused_c = try parseHex(s, u48, "datum label");
    return .{ value.fixnum.pack(label), unused_c };
}

fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void {
    const c = c1 orelse return s.err(error.UnexpectedEof, "datum label");
    if (c == '%') {
        return s.jump(next, cons(LABEL, l));
    }
    if (c == '=') {
        try s.push(next);
        s.context.val = l;
        return s.subr(.parse_unit, .end_label_datum);
    }
    return s.err(error.InvalidCharacter, "datum label");
}

fn endLabelDatum(s: *State) !void {
    const l = s.context.val;
    const d = s.result;
    if (d.eq(value.undef)) {
        return s.err(error.InvalidCharacter, "label datum");
    }
    return s.retval(cons(LABEL, cons(l, d)));
}

fn parseList(s: *State, open: u8, next: Fn) !void {
    const head = switch (open) {
        '(' => value.nil.nil,
        '[' => cons(SQUARE, value.nil.nil),
        '{' => cons(BRACE, value.nil.nil),
        else => unreachable,
    };
    const close: u8 = switch (open) {
        '(' => ')',
        '[' => ']',
        '{' => '}',
        else => unreachable,
    };
    while (try s.read()) |c| {
        if (c == close) {
            return s.jump(next, head);
        }
        switch (try checkBlank(s, c)) {
            .yes => {},
            .skip_unit => {
                try listParserSetup(s, head, close, next);
                // Parse twice in a row, ignoring the first result.
                return s.subr(.parse_unit, .parse_unit);
            },
            .skip_line => try s.skipLine(),
            .no => {
                try listParserSetup(s, head, close, next);
                s.unused_char = c;
                return s.jump(.parse_list_element, null);
            },
        }
    }
    return s.err(error.UnexpectedEof, "list");
}

fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void {
    s.context.val = head;
    s.context.char = close;
    try s.push(next);
    try s.pushContext(.continue_list);
}

fn parseListElement(s: *State) !void {
    return parseDatum(s, s.getUnused().?);
}

fn continueList(s: *State) !void {
    const close = s.context.char;

    if (s.result.eq(value.undef)) {
        const c = s.getUnused().?;
        if (c == close) {
            return endList(s);
        }
        return s.err(error.InvalidCharacter, "list");
    }

    if (s.result.eq(S_DOT)) {
        return s.subr(.parse_unit, .end_improper_list);
    }

    s.context.val = cons(s.result, s.context.val);

    var c1 = s.getUnused() orelse try s.read();
    while (c1) |c| : (c1 = try s.read()) {
        if (c == close) {
            return endList(s);
        }
        switch (try checkBlank(s, c)) {
            .yes => {},
            .skip_unit => {
                try s.pushContext(.continue_list);
                return s.subr(.parse_unit, .parse_unit);
            },
            .skip_line => try s.skipLine(),
            .no => {
                s.unused_char = c;
                return s.subr(.parse_list_element, .continue_list);
            },
        }
    }
    return s.err(error.UnexpectedEof, "list");
}

fn endList(s: *State) !void {
    return s.retval(lib.list.reverse(s.context.val));
}

fn endImproperList(s: *State) !void {
    const tail = s.result;
    if (tail.eq(value.undef)) {
        return s.err(error.InvalidCharacter, "list tail");
    }
    s.context.val = lib.list.reverseWithTail(s.context.val, tail);
    return closeImproperList(s);
}

fn closeImproperList(s: *State) !void {
    const result = s.context.val;
    const close = s.context.char;
    var c1 = s.getUnused() orelse try s.read();
    while (c1) |c| : (c1 = try s.readNoEof("after list tail")) {
        if (c == close) {
            return s.retval(result);
        }
        switch (try checkBlank(s, c)) {
            .yes => {},
            .skip_unit => return s.subr(.parse_unit, .close_improper_list),
            .skip_line => try s.skipLine(),
            .no => return s.err(error.InvalidCharacter, "after list tail"),
        }
    }
    unreachable;
}

fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void {
    const q = switch (c1) {
        '\'' => QUOTE,
        '`' => GRAVE,
        ',' => COMMA,
        else => unreachable,
    };

    // fast-path to avoid subr
    const c = try s.readNoEof("quote expression");
    if (isBareChar(c) or c == '\\') {
        const bs, s.unused_char = try parseBareString(s, c);
        return s.jump(next, cons(q, bs));
    }

    s.context.val = q;
    s.unused_char = c;
    return s.subr(.parse_list_element, .end_quote_expr);
}

fn endQuoteExpr(s: *State) !void {
    if (s.result.eq(value.undef)) {
        return s.err(error.InvalidCharacter, "quote expression datum");
    }
    const q = s.context.val;
    const d = s.result;
    return s.retval(cons(q, d));
}

// Helpers

fn parseHex(
    s: *State,
    u_type: type,
    emsg: []const u8,
) !struct { u_type, ?u8 } {
    var uc: u_type = undefined;

    const c1 = try s.readNoEof(emsg);
    uc = try parseHexDigit(s, c1, emsg);

    while (try s.read()) |c| {
        if (!std.ascii.isHex(c)) {
            return .{ uc, c };
        }
        const shl = std.math.shlExact;
        uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg);
        uc |= try parseHexDigit(s, c, emsg);
    }
    return .{ uc, null };
}

fn parseHexByte(s: *State, emsg: []const u8) !u8 {
    const h1 = try s.readNoEof(emsg);
    const h2 = try s.readNoEof(emsg);
    const hi = try parseHexDigit(s, h1, emsg);
    const lo = try parseHexDigit(s, h2, emsg);
    return hi << 4 | lo;
}

fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 {
    return switch (c) {
        '0'...'9' => c - '0',
        'A'...'F' => c - 'A' + 10,
        'a'...'f' => c - 'a' + 10,
        else => s.err(error.InvalidCharacter, emsg),
    };
}