diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-04-07 09:20:45 +0200 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-04-07 09:20:45 +0200 |
| commit | a8785d355b5ef81930af52dbc58f3bbb660d1ac2 (patch) | |
| tree | 28095bcca9ee7603dcbb94ad3fa41b4c89853f01 | |
| parent | 70089dacfa6bab5a1e1d0d5aa257e2d671493beb (diff) | |
Implement escaped newline in strings.
| -rw-r--r-- | spec/parser.bnf | 70 | ||||
| -rw-r--r-- | spec/syntax.abnf | 55 | ||||
| -rw-r--r-- | spec/syntax.zbnf | 55 | ||||
| -rw-r--r-- | src/test/parse.zig | 2 | ||||
| -rw-r--r-- | src/zisp/io/Parser.zig | 160 | ||||
| -rw-r--r-- | src/zisp/io/parser.zig | 2 | ||||
| -rw-r--r-- | src/zisp/value.zig | 7 |
7 files changed, 200 insertions, 151 deletions
diff --git a/spec/parser.bnf b/spec/parser.bnf deleted file mode 100644 index 338dc10..0000000 --- a/spec/parser.bnf +++ /dev/null @@ -1,70 +0,0 @@ -unit : blank* ( datum blank? | EOF ) ; - - -blank : 9...13 | comment ; - -datum : one_datum ( join_char? one_datum )* ; - -join_char : '.' | ':' ; - - -comment : ';' ( skip_unit | skip_line ) ; - -skip_unit : '~' unit ; - -skip_line : ( ~LF )* LF? ; - - -one_datum : bare_string | clad_datum ; - -bare_string : ( '.' | '+' | '-' | DIGIT ) ( bare_char | '.' )* - | bare_char+ - ; - -clad_datum : '\' bare_string - | '|' pipe_str_elt* '|' - | '"' quot_str_elt* '"' - | '#' hash_expr - | '(' list ')' - | '[' list ']' - | '{' list '}' - | quote_expr - ; - - -bare_char : ALPHA | DIGIT | bare_punct ; - -bare_punct : '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' - | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~' - ; - - -pipe_str_elt : ~( '|' | '\' ) | '\' pipe_esc ; - -quot_str_elt : ~( '"' | '\' ) | '\' quot_esc ; - -hash_expr : rune clad_datum? - | rune '\' bare_string - | '\' bare_string - | '%' label ( '%' | '=' datum ) - | clad_datum - ; - -list : unit* ( '.' unit )? blank* ; - -quote_expr : ( "'" | "`" | "," ) datum ; - - -pipe_esc : string_esc | '|' ; - -quot_esc : string_esc | '"' ; - -string_esc : '\' | 'a' | 'b' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' - | 'x' HEXDIG{2} - | 'u' '{' HEXDIG+ '}' - ; - - -rune : ALPHA ( ALPHA | DIGIT ){0,5} ; - -label : HEXDIG{1,12} ; diff --git a/spec/syntax.abnf b/spec/syntax.abnf new file mode 100644 index 0000000..a3c4ab9 --- /dev/null +++ b/spec/syntax.abnf @@ -0,0 +1,55 @@ +Unit = *Blank Datum [ ';' SkipLine ] + + +Blank = HTAB / LF / %x0b / %x0c / CR / Comment + +Datum = OneDatum *( [JoinChar] OneDatum ) + +JoinChar = '.' / ':' + + +Comment = ';' ( SkipUnit / SkipLine LF ) + +SkipUnit = '~' Unit + +SkipLine = *( %x00-09 / %x0b-ff ) ; any but LF + + +OneDatum = BareString / CladDatum + +BareString = ( '.' / '+' / '-' / DIGIT ) *( BareChar / '.' ) + / 1*BareChar + +CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|' + / '"' *( QuotStrChar / '\' StringEsc ) '"' + / '#' HashExpr + / '(' List ')' / '[' List ']' / '{' List '}' + / "'" Datum / '`' Datum / ',' Datum + + +BareChar = ALPHA / DIGIT + / '!' / '$' / '%' / '&' / '*' / '+' / '-' / '/' + / '<' / '=' / '>' / '?' / '@' / '^' / '_' / '~' + + +PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but | or \ + +QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but " or \ + +HashExpr = rune ( '\' BareString / [CladDatum] ) + / '\' BareString + / '%' Label ( '%' / '=' Datum ) + / CladDatum + +List = *Unit [ '.' Unit ] *Blank + + +StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP ) + / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' + / 'x' 1*( 2*HEXDIG ) ';' + / 'u' 1*6( HEXDIG ) ';' + + +Rune = ALPHA *5( ALPHA / DIGIT ) + +Label = 1*12( HEXDIG ) diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf new file mode 100644 index 0000000..5656864 --- /dev/null +++ b/spec/syntax.zbnf @@ -0,0 +1,55 @@ +Unit : Blank* ( Datum [Blank] | EOF ) + + +Blank : 9...13 | Comment + +Datum : OneDatum ( [JoinChar] OneDatum )* + +JoinChar : '.' | ':' + + +Comment : ';' ( SkipUnit | SkipLine ) + +SkipUnit : '~' Unit + +SkipLine : ( ~LF )* [LF] + + +OneDatum : BareString | CladDatum + +BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )* + | BareChar+ + +CladDatum : '|' PipeStrElt* '|' + | '"' QuotStrElt* '"' + | '#' HashExpr + | '(' List ')' | '[' List ']' | '{' List '}' + | "'" Datum | '`' Datum | ',' Datum + + +BareChar : ALPHA | DIGIT + | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' + | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~' + + +PipeStrElt : ~( '|' | '\' ) | '\' StringEsc + +QuotStrElt : ~( '"' | '\' ) | '\' StringEsc + +HashExpr : Rune [ '\' BareString | CladDatum ] + | '\' BareString + | '%' Label ( '%' | '=' Datum ) + | CladDatum + +List : Unit* [ '.' Unit ] Blank* + + +StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* + | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' + | 'x' ( HEXDIG{2} )+ ';' + | 'u' HEXDIG{1,6} ';' + + +Rune : ALPHA ( ALPHA | DIGIT ){0,5} + +Label : HEXDIG{1,12} diff --git a/src/test/parse.zig b/src/test/parse.zig index f1a7857..dd26098 100644 --- a/src/test/parse.zig +++ b/src/test/parse.zig @@ -55,7 +55,7 @@ test "parse long bare string" { try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz"))); try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz"))); try expect(parse("!$%&*+-/<=>?@^_~").eq(str("!$%&*+-/<=>?@^_~"))); - try expect(parse("|foo\\x20bar\\x0abaz|").eq(str("foo bar\nbaz"))); + try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz"))); } test "parse" { diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig index 14db959..4a2ed35 100644 --- a/src/zisp/io/Parser.zig +++ b/src/zisp/io/Parser.zig @@ -188,7 +188,7 @@ fn addChar(p: *Parser, c: u8) !void { try p.chars.append(p.alloc.chars, c); } -fn getString(p: *Parser) Value { +fn getCharsAsString(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return if (value.sstr.isValidSstr(p.chars.items)) value.sstr.pack(p.chars.items) @@ -196,7 +196,7 @@ fn getString(p: *Parser) Value { value.istr.intern(p.chars.items); } -fn getRune(p: *Parser) Value { +fn getCharsAsRune(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return value.rune.pack(p.chars.items); } @@ -433,84 +433,111 @@ fn parseBareString(p: *Parser, c1: u8) !Value { break; } } - return p.getString(); + return p.getCharsAsString(); } fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { return switch (c) { - '|' => p.jump(next, try p.parseEscapedString('|')), - '"' => p.jump(next, try p.parseEscapedString('"')), - '#' => p.parseHashExpression(next), + '|' => p.jump(next, try p.parseString('|')), + '"' => p.jump(next, try p.parseString('"')), + '#' => p.parseHashExpr(next), '(', '[', '{' => p.parseList(c, next), '\'', '`', ',' => p.parseQuoteExpr(c, next), else => p.abort(next, c), }; } -fn parseEscapedString(p: *Parser, close: u8) !Value { - while (try p.read()) |c| { - if (c == close) { - const s = p.getString(); +fn parseString(p: *Parser, comptime close: u8) !Value { + while (try p.read()) |c| sw: switch (c) { + close => { + const s = p.getCharsAsString(); return if (close == '"') p.cons(QUOTE, s) else s; - } - if (c != '\\') { - try p.addChar(c); - } else { - try p.parseQuotedEsc(close); - } - } - return error.UnclosedString; + }, + '\\' => switch (try p.readNoEof("string backslash escape")) { + '\\', '|', '"' => |c2| try p.addChar(c2), + '\t', ' ' => { + const c2 = try p.skipStringLfEscape(); + continue :sw c2; + }, + '\n' => { + const c2 = try p.skipStringIndent(); + continue :sw c2; + }, + 'x' => try p.parseStringRawHexEsc(), + 'u' => try p.parseStringUniHexEsc(), + else => |c2| try p.parseStringCharEsc(c2), + }, + // Important to use a capture here, since it may come from a labeled + // continue statement passing a new char directly to the switch. + else => |c2| try p.addChar(c2), + }; + return p.err(.UnclosedString, .{close} ++ " string"); } -fn parseQuotedEsc(p: *Parser, close: u8) !void { - const c = try p.readNoEof("quoted escape"); - if (c == close) return p.addChar(close); - if (c == 'u') return p.parseUniHexHandleErrors(); - try p.addChar(switch (c) { - '\\' => c, - '0' => 0, - 'a' => 7, - 'b' => 8, - 't' => 9, - 'n' => 10, - 'v' => 11, - 'f' => 12, - 'r' => 13, - 'e' => 27, - 'x' => try p.parseHexByte("hex escape"), - else => return p.err(.InvalidCharacter, "quoted escape"), - }); +fn skipStringLfEscape(p: *Parser) !u8 { + const msg = "string linefeed escape"; + while (try p.read()) |c| switch (c) { + '\t', ' ' => {}, + '\n' => return p.skipStringIndent(), + else => return p.err(.InvalidCharacter, msg), + }; + return p.err(.UnclosedString, msg); } -fn parseUniHexHandleErrors(p: *Parser) !void { - return p.parseUniHex() catch |e| switch (e) { - error.Utf8CannotEncodeSurrogateHalf => p.err( - .UnicodeError, - "unicode escape", - ), - else => e, +fn skipStringIndent(p: *Parser) !u8 { + while (try p.read()) |c| switch (c) { + '\t', ' ' => {}, + else => return c, }; + return p.err(.UnclosedString, "string linefeed escape"); } -fn parseUniHex(p: *Parser) !void { - const msg = "unicode escape"; - - if (try p.readNoEof(msg) != '{') { - return p.err(.InvalidCharacter, msg); +fn parseStringRawHexEsc(p: *Parser) !void { + const msg = "string raw hex escape"; + while (try p.read()) |c1| { + if (c1 == ';') return; + const c2 = try p.readNoEof(msg); + const hi = try p.parseHexDigit(c1, msg); + const lo = try p.parseHexDigit(c2, msg); + try p.addChar(hi << 4 | lo); } + return p.err(.UnclosedString, msg); +} + +fn parseStringUniHexEsc(p: *Parser) !void { + const msg = "string unicode escape"; const uc = try p.parseHex(u21, msg); - const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg); - if (c != '}') { + const c = p.getUnread() orelse try p.readNoEof(msg); + if (c != ';') { return p.err(.InvalidCharacter, msg); } - const n = try std.unicode.utf8CodepointSequenceLength(uc); + const n = std.unicode.utf8CodepointSequenceLength(uc) catch { + return p.err(.UnicodeError, msg); + }; const buf = try p.chars.addManyAsSlice(p.alloc.chars, n); - _ = try std.unicode.utf8Encode(uc, buf); + const n2 = std.unicode.utf8Encode(uc, buf) catch { + return p.err(.UnicodeError, msg); + }; + std.debug.assert(n == n2); +} + +fn parseStringCharEsc(p: *Parser, c: u8) !void { + try p.addChar(switch (c) { + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + else => return p.err(.InvalidCharacter, "string backslash escape"), + }); } -fn parseHashExpression(p: *Parser, next: Fn) !void { +fn parseHashExpr(p: *Parser, next: Fn) !void { const c = try p.readNoEof("hash expression"); if (std.ascii.isAlphabetic(c)) { const r = try p.parseRune(c); @@ -544,11 +571,11 @@ fn parseRune(p: *Parser, c1: u8) !Value { while (try p.read()) |c| : (len += 1) { if (len == 6 or !std.ascii.isAlphanumeric(c)) { p.unread(c); - return p.getRune(); + return p.getCharsAsRune(); } try p.addChar(c); } - return p.getRune(); + return p.getCharsAsRune(); } fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { @@ -558,10 +585,10 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { return p.jump(next, p.cons(r, try p.parseBareString(c1))); } if (c == '"') { - return p.jump(next, p.cons(r, try p.parseEscapedString('"'))); + return p.jump(next, p.cons(r, try p.parseString('"'))); } if (c == '|') { - return p.jump(next, p.cons(r, try p.parseEscapedString('|'))); + return p.jump(next, p.cons(r, try p.parseString('|'))); } p.unread(c); switch (c) { @@ -752,18 +779,9 @@ fn isBareChar(c: u8) bool { }; } -fn isBareEsc(c: u8) bool { - return switch (c) { - 33...126 => true, - else => false, - }; -} - fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { - var uc: T = undefined; - const c1 = try p.readNoEof(emsg); - uc = try p.parseHexDigit(c1, emsg); + var uc: T = try p.parseHexDigit(c1, emsg); while (try p.read()) |c| { if (!std.ascii.isHex(c)) { @@ -777,14 +795,6 @@ fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { return uc; } -fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { - const h1 = try p.readNoEof(emsg); - const h2 = try p.readNoEof(emsg); - const hi = try p.parseHexDigit(h1, emsg); - const lo = try p.parseHexDigit(h2, emsg); - return hi << 4 | lo; -} - fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { return switch (c) { '0'...'9' => c - '0', diff --git a/src/zisp/io/parser.zig b/src/zisp/io/parser.zig index cb96e44..cfe2bf1 100644 --- a/src/zisp/io/parser.zig +++ b/src/zisp/io/parser.zig @@ -56,7 +56,7 @@ pub fn parse(input: std.io.AnyReader) Value { pub fn _parse( input: std.io.AnyReader, comptime panic: bool, -) if (panic) Value else error{ParseError}!Value { +) if (panic) Value else error{ ParseError, OutOfMemory }!Value { const alloc = std.heap.smp_allocator; var sfa = DefaultSfa.init(alloc); var p = initSfa(&sfa) catch |e| if (panic) @panic("OOM") else return e; diff --git a/src/zisp/value.zig b/src/zisp/value.zig index 465cbbb..449a577 100644 --- a/src/zisp/value.zig +++ b/src/zisp/value.zig @@ -134,10 +134,9 @@ // // ==== Strings ==== // -// Another 48-bit space is used for strings of zero to six bytes. These are -// NUL-terminated if shorter than six bytes, meaning that NUL cannot appear in -// them, and they must be valid UTF-8, meaning that some other values could be -// hidden here in the future. (UTF-8 sequences cannot contain 0xFE or 0xFF.) +// Another 48-bit space is used for strings of zero to six bytes. Like runes, +// these are NUL-terminated if shorter than six bytes, meaning that NUL cannot +// appear in them, but otherwise they allow arbitrary bytes. // // ==== Small rationals ==== // |
