Implement escaped newline in strings.

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-04-07 09:20:45 +0200
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-04-07 09:20:45 +0200
commit: a8785d355b5ef81930af52dbc58f3bbb660d1ac2 (patch)
tree: 28095bcca9ee7603dcbb94ad3fa41b4c89853f01 /src
parent: 70089dacfa6bab5a1e1d0d5aa257e2d671493beb (diff)
4 files changed, 90 insertions, 81 deletions
diff --git a/src/test/parse.zig b/src/test/parse.zig
index f1a7857..dd26098 100644
--- a/src/test/parse.zig
+++ b/src/test/parse.zig
@@ -55,7 +55,7 @@ test "parse long bare string" {
     try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz")));
     try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz")));
     try expect(parse("!$%&*+-/<=>?@^_~").eq(str("!$%&*+-/<=>?@^_~")));
-    try expect(parse("|foo\\x20bar\\x0abaz|").eq(str("foo bar\nbaz")));
+    try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz")));
 }
 
 test "parse" {
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 14db959..4a2ed35 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -188,7 +188,7 @@ fn addChar(p: *Parser, c: u8) !void {
     try p.chars.append(p.alloc.chars, c);
 }
 
-fn getString(p: *Parser) Value {
+fn getCharsAsString(p: *Parser) Value {
     defer p.chars.clearRetainingCapacity();
     return if (value.sstr.isValidSstr(p.chars.items))
         value.sstr.pack(p.chars.items)
@@ -196,7 +196,7 @@ fn getString(p: *Parser) Value {
         value.istr.intern(p.chars.items);
 }
 
-fn getRune(p: *Parser) Value {
+fn getCharsAsRune(p: *Parser) Value {
     defer p.chars.clearRetainingCapacity();
     return value.rune.pack(p.chars.items);
 }
@@ -433,84 +433,111 @@ fn parseBareString(p: *Parser, c1: u8) !Value {
             break;
         }
     }
-    return p.getString();
+    return p.getCharsAsString();
 }
 
 fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
     return switch (c) {
-        '|' => p.jump(next, try p.parseEscapedString('|')),
-        '"' => p.jump(next, try p.parseEscapedString('"')),
-        '#' => p.parseHashExpression(next),
+        '|' => p.jump(next, try p.parseString('|')),
+        '"' => p.jump(next, try p.parseString('"')),
+        '#' => p.parseHashExpr(next),
         '(', '[', '{' => p.parseList(c, next),
         '\'', '`', ',' => p.parseQuoteExpr(c, next),
         else => p.abort(next, c),
     };
 }
 
-fn parseEscapedString(p: *Parser, close: u8) !Value {
-    while (try p.read()) |c| {
-        if (c == close) {
-            const s = p.getString();
+fn parseString(p: *Parser, comptime close: u8) !Value {
+    while (try p.read()) |c| sw: switch (c) {
+        close => {
+            const s = p.getCharsAsString();
             return if (close == '"') p.cons(QUOTE, s) else s;
-        }
-        if (c != '\\') {
-            try p.addChar(c);
-        } else {
-            try p.parseQuotedEsc(close);
-        }
-    }
-    return error.UnclosedString;
+        },
+        '\\' => switch (try p.readNoEof("string backslash escape")) {
+            '\\', '|', '"' => |c2| try p.addChar(c2),
+            '\t', ' ' => {
+                const c2 = try p.skipStringLfEscape();
+                continue :sw c2;
+            },
+            '\n' => {
+                const c2 = try p.skipStringIndent();
+                continue :sw c2;
+            },
+            'x' => try p.parseStringRawHexEsc(),
+            'u' => try p.parseStringUniHexEsc(),
+            else => |c2| try p.parseStringCharEsc(c2),
+        },
+        // Important to use a capture here, since it may come from a labeled
+        // continue statement passing a new char directly to the switch.
+        else => |c2| try p.addChar(c2),
+    };
+    return p.err(.UnclosedString, .{close} ++ " string");
 }
 
-fn parseQuotedEsc(p: *Parser, close: u8) !void {
-    const c = try p.readNoEof("quoted escape");
-    if (c == close) return p.addChar(close);
-    if (c == 'u') return p.parseUniHexHandleErrors();
-    try p.addChar(switch (c) {
-        '\\' => c,
-        '0' => 0,
-        'a' => 7,
-        'b' => 8,
-        't' => 9,
-        'n' => 10,
-        'v' => 11,
-        'f' => 12,
-        'r' => 13,
-        'e' => 27,
-        'x' => try p.parseHexByte("hex escape"),
-        else => return p.err(.InvalidCharacter, "quoted escape"),
-    });
+fn skipStringLfEscape(p: *Parser) !u8 {
+    const msg = "string linefeed escape";
+    while (try p.read()) |c| switch (c) {
+        '\t', ' ' => {},
+        '\n' => return p.skipStringIndent(),
+        else => return p.err(.InvalidCharacter, msg),
+    };
+    return p.err(.UnclosedString, msg);
 }
 
-fn parseUniHexHandleErrors(p: *Parser) !void {
-    return p.parseUniHex() catch |e| switch (e) {
-        error.Utf8CannotEncodeSurrogateHalf => p.err(
-            .UnicodeError,
-            "unicode escape",
-        ),
-        else => e,
+fn skipStringIndent(p: *Parser) !u8 {
+    while (try p.read()) |c| switch (c) {
+        '\t', ' ' => {},
+        else => return c,
     };
+    return p.err(.UnclosedString, "string linefeed escape");
 }
 
-fn parseUniHex(p: *Parser) !void {
-    const msg = "unicode escape";
-
-    if (try p.readNoEof(msg) != '{') {
-        return p.err(.InvalidCharacter, msg);
+fn parseStringRawHexEsc(p: *Parser) !void {
+    const msg = "string raw hex escape";
+    while (try p.read()) |c1| {
+        if (c1 == ';') return;
+        const c2 = try p.readNoEof(msg);
+        const hi = try p.parseHexDigit(c1, msg);
+        const lo = try p.parseHexDigit(c2, msg);
+        try p.addChar(hi << 4 | lo);
     }
+    return p.err(.UnclosedString, msg);
+}
+
+fn parseStringUniHexEsc(p: *Parser) !void {
+    const msg = "string unicode escape";
 
     const uc = try p.parseHex(u21, msg);
-    const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg);
-    if (c != '}') {
+    const c = p.getUnread() orelse try p.readNoEof(msg);
+    if (c != ';') {
         return p.err(.InvalidCharacter, msg);
     }
 
-    const n = try std.unicode.utf8CodepointSequenceLength(uc);
+    const n = std.unicode.utf8CodepointSequenceLength(uc) catch {
+        return p.err(.UnicodeError, msg);
+    };
     const buf = try p.chars.addManyAsSlice(p.alloc.chars, n);
-    _ = try std.unicode.utf8Encode(uc, buf);
+    const n2 = std.unicode.utf8Encode(uc, buf) catch {
+        return p.err(.UnicodeError, msg);
+    };
+    std.debug.assert(n == n2);
+}
+
+fn parseStringCharEsc(p: *Parser, c: u8) !void {
+    try p.addChar(switch (c) {
+        'a' => 7,
+        'b' => 8,
+        't' => 9,
+        'n' => 10,
+        'v' => 11,
+        'f' => 12,
+        'r' => 13,
+        'e' => 27,
+        else => return p.err(.InvalidCharacter, "string backslash escape"),
+    });
 }
 
-fn parseHashExpression(p: *Parser, next: Fn) !void {
+fn parseHashExpr(p: *Parser, next: Fn) !void {
     const c = try p.readNoEof("hash expression");
     if (std.ascii.isAlphabetic(c)) {
         const r = try p.parseRune(c);
@@ -544,11 +571,11 @@ fn parseRune(p: *Parser, c1: u8) !Value {
     while (try p.read()) |c| : (len += 1) {
         if (len == 6 or !std.ascii.isAlphanumeric(c)) {
             p.unread(c);
-            return p.getRune();
+            return p.getCharsAsRune();
         }
         try p.addChar(c);
     }
-    return p.getRune();
+    return p.getCharsAsRune();
 }
 
 fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
@@ -558,10 +585,10 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
         return p.jump(next, p.cons(r, try p.parseBareString(c1)));
     }
     if (c == '"') {
-        return p.jump(next, p.cons(r, try p.parseEscapedString('"')));
+        return p.jump(next, p.cons(r, try p.parseString('"')));
     }
     if (c == '|') {
-        return p.jump(next, p.cons(r, try p.parseEscapedString('|')));
+        return p.jump(next, p.cons(r, try p.parseString('|')));
     }
     p.unread(c);
     switch (c) {
@@ -752,18 +779,9 @@ fn isBareChar(c: u8) bool {
     };
 }
 
-fn isBareEsc(c: u8) bool {
-    return switch (c) {
-        33...126 => true,
-        else => false,
-    };
-}
-
 fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T {
-    var uc: T = undefined;
-
     const c1 = try p.readNoEof(emsg);
-    uc = try p.parseHexDigit(c1, emsg);
+    var uc: T = try p.parseHexDigit(c1, emsg);
 
     while (try p.read()) |c| {
         if (!std.ascii.isHex(c)) {
@@ -777,14 +795,6 @@ fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T {
     return uc;
 }
 
-fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 {
-    const h1 = try p.readNoEof(emsg);
-    const h2 = try p.readNoEof(emsg);
-    const hi = try p.parseHexDigit(h1, emsg);
-    const lo = try p.parseHexDigit(h2, emsg);
-    return hi << 4 | lo;
-}
-
 fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 {
     return switch (c) {
         '0'...'9' => c - '0',
diff --git a/src/zisp/io/parser.zig b/src/zisp/io/parser.zig
index cb96e44..cfe2bf1 100644
--- a/src/zisp/io/parser.zig
+++ b/src/zisp/io/parser.zig
@@ -56,7 +56,7 @@ pub fn parse(input: std.io.AnyReader) Value {
 pub fn _parse(
     input: std.io.AnyReader,
     comptime panic: bool,
-) if (panic) Value else error{ParseError}!Value {
+) if (panic) Value else error{ ParseError, OutOfMemory }!Value {
     const alloc = std.heap.smp_allocator;
     var sfa = DefaultSfa.init(alloc);
     var p = initSfa(&sfa) catch |e| if (panic) @panic("OOM") else return e;
diff --git a/src/zisp/value.zig b/src/zisp/value.zig
index 465cbbb..449a577 100644
--- a/src/zisp/value.zig
+++ b/src/zisp/value.zig
@@ -134,10 +134,9 @@
 //
 // ==== Strings ====
 //
-// Another 48-bit space is used for strings of zero to six bytes.  These are
-// NUL-terminated if shorter than six bytes, meaning that NUL cannot appear in
-// them, and they must be valid UTF-8, meaning that some other values could be
-// hidden here in the future.  (UTF-8 sequences cannot contain 0xFE or 0xFF.)
+// Another 48-bit space is used for strings of zero to six bytes.  Like runes,
+// these are NUL-terminated if shorter than six bytes, meaning that NUL cannot
+// appear in them, but otherwise they allow arbitrary bytes.
 //
 // ==== Small rationals ====
 //
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-04-07 09:20:45 +0200
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-04-07 09:20:45 +0200
commit	a8785d355b5ef81930af52dbc58f3bbb660d1ac2 (patch)
tree	28095bcca9ee7603dcbb94ad3fa41b4c89853f01 /src
parent	70089dacfa6bab5a1e1d0d5aa257e2d671493beb (diff)