diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-04-07 09:20:45 +0200 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-04-07 09:20:45 +0200 |
| commit | a8785d355b5ef81930af52dbc58f3bbb660d1ac2 (patch) | |
| tree | 28095bcca9ee7603dcbb94ad3fa41b4c89853f01 /src | |
| parent | 70089dacfa6bab5a1e1d0d5aa257e2d671493beb (diff) | |
Implement escaped newline in strings.
Diffstat (limited to 'src')
| -rw-r--r-- | src/test/parse.zig | 2 | ||||
| -rw-r--r-- | src/zisp/io/Parser.zig | 160 | ||||
| -rw-r--r-- | src/zisp/io/parser.zig | 2 | ||||
| -rw-r--r-- | src/zisp/value.zig | 7 |
4 files changed, 90 insertions, 81 deletions
diff --git a/src/test/parse.zig b/src/test/parse.zig index f1a7857..dd26098 100644 --- a/src/test/parse.zig +++ b/src/test/parse.zig @@ -55,7 +55,7 @@ test "parse long bare string" { try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz"))); try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz"))); try expect(parse("!$%&*+-/<=>?@^_~").eq(str("!$%&*+-/<=>?@^_~"))); - try expect(parse("|foo\\x20bar\\x0abaz|").eq(str("foo bar\nbaz"))); + try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz"))); } test "parse" { diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig index 14db959..4a2ed35 100644 --- a/src/zisp/io/Parser.zig +++ b/src/zisp/io/Parser.zig @@ -188,7 +188,7 @@ fn addChar(p: *Parser, c: u8) !void { try p.chars.append(p.alloc.chars, c); } -fn getString(p: *Parser) Value { +fn getCharsAsString(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return if (value.sstr.isValidSstr(p.chars.items)) value.sstr.pack(p.chars.items) @@ -196,7 +196,7 @@ fn getString(p: *Parser) Value { value.istr.intern(p.chars.items); } -fn getRune(p: *Parser) Value { +fn getCharsAsRune(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return value.rune.pack(p.chars.items); } @@ -433,84 +433,111 @@ fn parseBareString(p: *Parser, c1: u8) !Value { break; } } - return p.getString(); + return p.getCharsAsString(); } fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { return switch (c) { - '|' => p.jump(next, try p.parseEscapedString('|')), - '"' => p.jump(next, try p.parseEscapedString('"')), - '#' => p.parseHashExpression(next), + '|' => p.jump(next, try p.parseString('|')), + '"' => p.jump(next, try p.parseString('"')), + '#' => p.parseHashExpr(next), '(', '[', '{' => p.parseList(c, next), '\'', '`', ',' => p.parseQuoteExpr(c, next), else => p.abort(next, c), }; } -fn parseEscapedString(p: *Parser, close: u8) !Value { - while (try p.read()) |c| { - if (c == close) { - const s = p.getString(); +fn parseString(p: *Parser, comptime close: u8) !Value { + while (try p.read()) |c| sw: switch (c) { + close => { + const s = p.getCharsAsString(); return if (close == '"') p.cons(QUOTE, s) else s; - } - if (c != '\\') { - try p.addChar(c); - } else { - try p.parseQuotedEsc(close); - } - } - return error.UnclosedString; + }, + '\\' => switch (try p.readNoEof("string backslash escape")) { + '\\', '|', '"' => |c2| try p.addChar(c2), + '\t', ' ' => { + const c2 = try p.skipStringLfEscape(); + continue :sw c2; + }, + '\n' => { + const c2 = try p.skipStringIndent(); + continue :sw c2; + }, + 'x' => try p.parseStringRawHexEsc(), + 'u' => try p.parseStringUniHexEsc(), + else => |c2| try p.parseStringCharEsc(c2), + }, + // Important to use a capture here, since it may come from a labeled + // continue statement passing a new char directly to the switch. + else => |c2| try p.addChar(c2), + }; + return p.err(.UnclosedString, .{close} ++ " string"); } -fn parseQuotedEsc(p: *Parser, close: u8) !void { - const c = try p.readNoEof("quoted escape"); - if (c == close) return p.addChar(close); - if (c == 'u') return p.parseUniHexHandleErrors(); - try p.addChar(switch (c) { - '\\' => c, - '0' => 0, - 'a' => 7, - 'b' => 8, - 't' => 9, - 'n' => 10, - 'v' => 11, - 'f' => 12, - 'r' => 13, - 'e' => 27, - 'x' => try p.parseHexByte("hex escape"), - else => return p.err(.InvalidCharacter, "quoted escape"), - }); +fn skipStringLfEscape(p: *Parser) !u8 { + const msg = "string linefeed escape"; + while (try p.read()) |c| switch (c) { + '\t', ' ' => {}, + '\n' => return p.skipStringIndent(), + else => return p.err(.InvalidCharacter, msg), + }; + return p.err(.UnclosedString, msg); } -fn parseUniHexHandleErrors(p: *Parser) !void { - return p.parseUniHex() catch |e| switch (e) { - error.Utf8CannotEncodeSurrogateHalf => p.err( - .UnicodeError, - "unicode escape", - ), - else => e, +fn skipStringIndent(p: *Parser) !u8 { + while (try p.read()) |c| switch (c) { + '\t', ' ' => {}, + else => return c, }; + return p.err(.UnclosedString, "string linefeed escape"); } -fn parseUniHex(p: *Parser) !void { - const msg = "unicode escape"; - - if (try p.readNoEof(msg) != '{') { - return p.err(.InvalidCharacter, msg); +fn parseStringRawHexEsc(p: *Parser) !void { + const msg = "string raw hex escape"; + while (try p.read()) |c1| { + if (c1 == ';') return; + const c2 = try p.readNoEof(msg); + const hi = try p.parseHexDigit(c1, msg); + const lo = try p.parseHexDigit(c2, msg); + try p.addChar(hi << 4 | lo); } + return p.err(.UnclosedString, msg); +} + +fn parseStringUniHexEsc(p: *Parser) !void { + const msg = "string unicode escape"; const uc = try p.parseHex(u21, msg); - const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg); - if (c != '}') { + const c = p.getUnread() orelse try p.readNoEof(msg); + if (c != ';') { return p.err(.InvalidCharacter, msg); } - const n = try std.unicode.utf8CodepointSequenceLength(uc); + const n = std.unicode.utf8CodepointSequenceLength(uc) catch { + return p.err(.UnicodeError, msg); + }; const buf = try p.chars.addManyAsSlice(p.alloc.chars, n); - _ = try std.unicode.utf8Encode(uc, buf); + const n2 = std.unicode.utf8Encode(uc, buf) catch { + return p.err(.UnicodeError, msg); + }; + std.debug.assert(n == n2); +} + +fn parseStringCharEsc(p: *Parser, c: u8) !void { + try p.addChar(switch (c) { + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + else => return p.err(.InvalidCharacter, "string backslash escape"), + }); } -fn parseHashExpression(p: *Parser, next: Fn) !void { +fn parseHashExpr(p: *Parser, next: Fn) !void { const c = try p.readNoEof("hash expression"); if (std.ascii.isAlphabetic(c)) { const r = try p.parseRune(c); @@ -544,11 +571,11 @@ fn parseRune(p: *Parser, c1: u8) !Value { while (try p.read()) |c| : (len += 1) { if (len == 6 or !std.ascii.isAlphanumeric(c)) { p.unread(c); - return p.getRune(); + return p.getCharsAsRune(); } try p.addChar(c); } - return p.getRune(); + return p.getCharsAsRune(); } fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { @@ -558,10 +585,10 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { return p.jump(next, p.cons(r, try p.parseBareString(c1))); } if (c == '"') { - return p.jump(next, p.cons(r, try p.parseEscapedString('"'))); + return p.jump(next, p.cons(r, try p.parseString('"'))); } if (c == '|') { - return p.jump(next, p.cons(r, try p.parseEscapedString('|'))); + return p.jump(next, p.cons(r, try p.parseString('|'))); } p.unread(c); switch (c) { @@ -752,18 +779,9 @@ fn isBareChar(c: u8) bool { }; } -fn isBareEsc(c: u8) bool { - return switch (c) { - 33...126 => true, - else => false, - }; -} - fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { - var uc: T = undefined; - const c1 = try p.readNoEof(emsg); - uc = try p.parseHexDigit(c1, emsg); + var uc: T = try p.parseHexDigit(c1, emsg); while (try p.read()) |c| { if (!std.ascii.isHex(c)) { @@ -777,14 +795,6 @@ fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { return uc; } -fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { - const h1 = try p.readNoEof(emsg); - const h2 = try p.readNoEof(emsg); - const hi = try p.parseHexDigit(h1, emsg); - const lo = try p.parseHexDigit(h2, emsg); - return hi << 4 | lo; -} - fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { return switch (c) { '0'...'9' => c - '0', diff --git a/src/zisp/io/parser.zig b/src/zisp/io/parser.zig index cb96e44..cfe2bf1 100644 --- a/src/zisp/io/parser.zig +++ b/src/zisp/io/parser.zig @@ -56,7 +56,7 @@ pub fn parse(input: std.io.AnyReader) Value { pub fn _parse( input: std.io.AnyReader, comptime panic: bool, -) if (panic) Value else error{ParseError}!Value { +) if (panic) Value else error{ ParseError, OutOfMemory }!Value { const alloc = std.heap.smp_allocator; var sfa = DefaultSfa.init(alloc); var p = initSfa(&sfa) catch |e| if (panic) @panic("OOM") else return e; diff --git a/src/zisp/value.zig b/src/zisp/value.zig index 465cbbb..449a577 100644 --- a/src/zisp/value.zig +++ b/src/zisp/value.zig @@ -134,10 +134,9 @@ // // ==== Strings ==== // -// Another 48-bit space is used for strings of zero to six bytes. These are -// NUL-terminated if shorter than six bytes, meaning that NUL cannot appear in -// them, and they must be valid UTF-8, meaning that some other values could be -// hidden here in the future. (UTF-8 sequences cannot contain 0xFE or 0xFF.) +// Another 48-bit space is used for strings of zero to six bytes. Like runes, +// these are NUL-terminated if shorter than six bytes, meaning that NUL cannot +// appear in them, but otherwise they allow arbitrary bytes. // // ==== Small rationals ==== // |
