summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-04-07 09:20:45 +0200
committerTaylan Kammer <taylan.kammer@gmail.com>2025-04-07 09:20:45 +0200
commita8785d355b5ef81930af52dbc58f3bbb660d1ac2 (patch)
tree28095bcca9ee7603dcbb94ad3fa41b4c89853f01
parent70089dacfa6bab5a1e1d0d5aa257e2d671493beb (diff)
Implement escaped newline in strings.
-rw-r--r--spec/parser.bnf70
-rw-r--r--spec/syntax.abnf55
-rw-r--r--spec/syntax.zbnf55
-rw-r--r--src/test/parse.zig2
-rw-r--r--src/zisp/io/Parser.zig160
-rw-r--r--src/zisp/io/parser.zig2
-rw-r--r--src/zisp/value.zig7
7 files changed, 200 insertions, 151 deletions
diff --git a/spec/parser.bnf b/spec/parser.bnf
deleted file mode 100644
index 338dc10..0000000
--- a/spec/parser.bnf
+++ /dev/null
@@ -1,70 +0,0 @@
-unit : blank* ( datum blank? | EOF ) ;
-
-
-blank : 9...13 | comment ;
-
-datum : one_datum ( join_char? one_datum )* ;
-
-join_char : '.' | ':' ;
-
-
-comment : ';' ( skip_unit | skip_line ) ;
-
-skip_unit : '~' unit ;
-
-skip_line : ( ~LF )* LF? ;
-
-
-one_datum : bare_string | clad_datum ;
-
-bare_string : ( '.' | '+' | '-' | DIGIT ) ( bare_char | '.' )*
- | bare_char+
- ;
-
-clad_datum : '\' bare_string
- | '|' pipe_str_elt* '|'
- | '"' quot_str_elt* '"'
- | '#' hash_expr
- | '(' list ')'
- | '[' list ']'
- | '{' list '}'
- | quote_expr
- ;
-
-
-bare_char : ALPHA | DIGIT | bare_punct ;
-
-bare_punct : '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
- | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~'
- ;
-
-
-pipe_str_elt : ~( '|' | '\' ) | '\' pipe_esc ;
-
-quot_str_elt : ~( '"' | '\' ) | '\' quot_esc ;
-
-hash_expr : rune clad_datum?
- | rune '\' bare_string
- | '\' bare_string
- | '%' label ( '%' | '=' datum )
- | clad_datum
- ;
-
-list : unit* ( '.' unit )? blank* ;
-
-quote_expr : ( "'" | "`" | "," ) datum ;
-
-
-pipe_esc : string_esc | '|' ;
-
-quot_esc : string_esc | '"' ;
-
-string_esc : '\' | 'a' | 'b' | 'e' | 'f' | 'n' | 'r' | 't' | 'v'
- | 'x' HEXDIG{2}
- | 'u' '{' HEXDIG+ '}'
- ;
-
-
-rune : ALPHA ( ALPHA | DIGIT ){0,5} ;
-
-label : HEXDIG{1,12} ;
diff --git a/spec/syntax.abnf b/spec/syntax.abnf
new file mode 100644
index 0000000..a3c4ab9
--- /dev/null
+++ b/spec/syntax.abnf
@@ -0,0 +1,55 @@
+Unit = *Blank Datum [ ';' SkipLine ]
+
+
+Blank = HTAB / LF / %x0b / %x0c / CR / Comment
+
+Datum = OneDatum *( [JoinChar] OneDatum )
+
+JoinChar = '.' / ':'
+
+
+Comment = ';' ( SkipUnit / SkipLine LF )
+
+SkipUnit = '~' Unit
+
+SkipLine = *( %x00-09 / %x0b-ff ) ; any but LF
+
+
+OneDatum = BareString / CladDatum
+
+BareString = ( '.' / '+' / '-' / DIGIT ) *( BareChar / '.' )
+ / 1*BareChar
+
+CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|'
+ / '"' *( QuotStrChar / '\' StringEsc ) '"'
+ / '#' HashExpr
+ / '(' List ')' / '[' List ']' / '{' List '}'
+ / "'" Datum / '`' Datum / ',' Datum
+
+
+BareChar = ALPHA / DIGIT
+ / '!' / '$' / '%' / '&' / '*' / '+' / '-' / '/'
+ / '<' / '=' / '>' / '?' / '@' / '^' / '_' / '~'
+
+
+PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but | or \
+
+QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but " or \
+
+HashExpr = Rune ( '\' BareString / [CladDatum] )
+ / '\' BareString
+ / '%' Label ( '%' / '=' Datum )
+ / CladDatum
+
+List = *Unit [ '.' Unit ] *Blank
+
+
+StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP )
+ / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+ / 'x' 1*( 2HEXDIG ) ';'
+ / 'u' 1*6( HEXDIG ) ';'
+
+
+Rune = ALPHA *5( ALPHA / DIGIT )
+
+Label = 1*12( HEXDIG )
diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf
new file mode 100644
index 0000000..5656864
--- /dev/null
+++ b/spec/syntax.zbnf
@@ -0,0 +1,55 @@
+Unit : Blank* ( Datum [Blank] | EOF )
+
+
+Blank : 9...13 | Comment
+
+Datum : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar : '.' | ':'
+
+
+Comment : ';' ( SkipUnit | SkipLine )
+
+SkipUnit : '~' Unit
+
+SkipLine : ( ~LF )* [LF]
+
+
+OneDatum : BareString | CladDatum
+
+BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+ | BareChar+
+
+CladDatum : '|' PipeStrElt* '|'
+ | '"' QuotStrElt* '"'
+ | '#' HashExpr
+ | '(' List ')' | '[' List ']' | '{' List '}'
+ | "'" Datum | '`' Datum | ',' Datum
+
+
+BareChar : ALPHA | DIGIT
+ | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
+ | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~'
+
+
+PipeStrElt : ~( '|' | '\' ) | '\' StringEsc
+
+QuotStrElt : ~( '"' | '\' ) | '\' StringEsc
+
+HashExpr : Rune [ '\' BareString | CladDatum ]
+ | '\' BareString
+ | '%' Label ( '%' | '=' Datum )
+ | CladDatum
+
+List : Unit* [ '.' Unit ] Blank*
+
+
+StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+ | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+ | 'x' ( HEXDIG{2} )+ ';'
+ | 'u' HEXDIG{1,6} ';'
+
+
+Rune : ALPHA ( ALPHA | DIGIT ){0,5}
+
+Label : HEXDIG{1,12}
diff --git a/src/test/parse.zig b/src/test/parse.zig
index f1a7857..dd26098 100644
--- a/src/test/parse.zig
+++ b/src/test/parse.zig
@@ -55,7 +55,7 @@ test "parse long bare string" {
try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz")));
try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz")));
try expect(parse("!$%&*+-/<=>?@^_~").eq(str("!$%&*+-/<=>?@^_~")));
- try expect(parse("|foo\\x20bar\\x0abaz|").eq(str("foo bar\nbaz")));
+ try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz")));
}
test "parse" {
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 14db959..4a2ed35 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -188,7 +188,7 @@ fn addChar(p: *Parser, c: u8) !void {
try p.chars.append(p.alloc.chars, c);
}
-fn getString(p: *Parser) Value {
+fn getCharsAsString(p: *Parser) Value {
defer p.chars.clearRetainingCapacity();
return if (value.sstr.isValidSstr(p.chars.items))
value.sstr.pack(p.chars.items)
@@ -196,7 +196,7 @@ fn getString(p: *Parser) Value {
value.istr.intern(p.chars.items);
}
-fn getRune(p: *Parser) Value {
+fn getCharsAsRune(p: *Parser) Value {
defer p.chars.clearRetainingCapacity();
return value.rune.pack(p.chars.items);
}
@@ -433,84 +433,111 @@ fn parseBareString(p: *Parser, c1: u8) !Value {
break;
}
}
- return p.getString();
+ return p.getCharsAsString();
}
fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
return switch (c) {
- '|' => p.jump(next, try p.parseEscapedString('|')),
- '"' => p.jump(next, try p.parseEscapedString('"')),
- '#' => p.parseHashExpression(next),
+ '|' => p.jump(next, try p.parseString('|')),
+ '"' => p.jump(next, try p.parseString('"')),
+ '#' => p.parseHashExpr(next),
'(', '[', '{' => p.parseList(c, next),
'\'', '`', ',' => p.parseQuoteExpr(c, next),
else => p.abort(next, c),
};
}
-fn parseEscapedString(p: *Parser, close: u8) !Value {
- while (try p.read()) |c| {
- if (c == close) {
- const s = p.getString();
+fn parseString(p: *Parser, comptime close: u8) !Value {
+ while (try p.read()) |c| sw: switch (c) {
+ close => {
+ const s = p.getCharsAsString();
return if (close == '"') p.cons(QUOTE, s) else s;
- }
- if (c != '\\') {
- try p.addChar(c);
- } else {
- try p.parseQuotedEsc(close);
- }
- }
- return error.UnclosedString;
+ },
+ '\\' => switch (try p.readNoEof("string backslash escape")) {
+ '\\', '|', '"' => |c2| try p.addChar(c2),
+ '\t', ' ' => {
+ const c2 = try p.skipStringLfEscape();
+ continue :sw c2;
+ },
+ '\n' => {
+ const c2 = try p.skipStringIndent();
+ continue :sw c2;
+ },
+ 'x' => try p.parseStringRawHexEsc(),
+ 'u' => try p.parseStringUniHexEsc(),
+ else => |c2| try p.parseStringCharEsc(c2),
+ },
+ // Important to use a capture here, since it may come from a labeled
+ // continue statement passing a new char directly to the switch.
+ else => |c2| try p.addChar(c2),
+ };
+ return p.err(.UnclosedString, .{close} ++ " string");
}
-fn parseQuotedEsc(p: *Parser, close: u8) !void {
- const c = try p.readNoEof("quoted escape");
- if (c == close) return p.addChar(close);
- if (c == 'u') return p.parseUniHexHandleErrors();
- try p.addChar(switch (c) {
- '\\' => c,
- '0' => 0,
- 'a' => 7,
- 'b' => 8,
- 't' => 9,
- 'n' => 10,
- 'v' => 11,
- 'f' => 12,
- 'r' => 13,
- 'e' => 27,
- 'x' => try p.parseHexByte("hex escape"),
- else => return p.err(.InvalidCharacter, "quoted escape"),
- });
+fn skipStringLfEscape(p: *Parser) !u8 {
+ const msg = "string linefeed escape";
+ while (try p.read()) |c| switch (c) {
+ '\t', ' ' => {},
+ '\n' => return p.skipStringIndent(),
+ else => return p.err(.InvalidCharacter, msg),
+ };
+ return p.err(.UnclosedString, msg);
}
-fn parseUniHexHandleErrors(p: *Parser) !void {
- return p.parseUniHex() catch |e| switch (e) {
- error.Utf8CannotEncodeSurrogateHalf => p.err(
- .UnicodeError,
- "unicode escape",
- ),
- else => e,
+fn skipStringIndent(p: *Parser) !u8 {
+ while (try p.read()) |c| switch (c) {
+ '\t', ' ' => {},
+ else => return c,
};
+ return p.err(.UnclosedString, "string linefeed escape");
}
-fn parseUniHex(p: *Parser) !void {
- const msg = "unicode escape";
-
- if (try p.readNoEof(msg) != '{') {
- return p.err(.InvalidCharacter, msg);
+fn parseStringRawHexEsc(p: *Parser) !void {
+ const msg = "string raw hex escape";
+ while (try p.read()) |c1| {
+ if (c1 == ';') return;
+ const c2 = try p.readNoEof(msg);
+ const hi = try p.parseHexDigit(c1, msg);
+ const lo = try p.parseHexDigit(c2, msg);
+ try p.addChar(hi << 4 | lo);
}
+ return p.err(.UnclosedString, msg);
+}
+
+fn parseStringUniHexEsc(p: *Parser) !void {
+ const msg = "string unicode escape";
const uc = try p.parseHex(u21, msg);
- const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg);
- if (c != '}') {
+ const c = p.getUnread() orelse try p.readNoEof(msg);
+ if (c != ';') {
return p.err(.InvalidCharacter, msg);
}
- const n = try std.unicode.utf8CodepointSequenceLength(uc);
+ const n = std.unicode.utf8CodepointSequenceLength(uc) catch {
+ return p.err(.UnicodeError, msg);
+ };
const buf = try p.chars.addManyAsSlice(p.alloc.chars, n);
- _ = try std.unicode.utf8Encode(uc, buf);
+ const n2 = std.unicode.utf8Encode(uc, buf) catch {
+ return p.err(.UnicodeError, msg);
+ };
+ std.debug.assert(n == n2);
+}
+
+fn parseStringCharEsc(p: *Parser, c: u8) !void {
+ try p.addChar(switch (c) {
+ 'a' => 7,
+ 'b' => 8,
+ 't' => 9,
+ 'n' => 10,
+ 'v' => 11,
+ 'f' => 12,
+ 'r' => 13,
+ 'e' => 27,
+ else => return p.err(.InvalidCharacter, "string backslash escape"),
+ });
}
-fn parseHashExpression(p: *Parser, next: Fn) !void {
+fn parseHashExpr(p: *Parser, next: Fn) !void {
const c = try p.readNoEof("hash expression");
if (std.ascii.isAlphabetic(c)) {
const r = try p.parseRune(c);
@@ -544,11 +571,11 @@ fn parseRune(p: *Parser, c1: u8) !Value {
while (try p.read()) |c| : (len += 1) {
if (len == 6 or !std.ascii.isAlphanumeric(c)) {
p.unread(c);
- return p.getRune();
+ return p.getCharsAsRune();
}
try p.addChar(c);
}
- return p.getRune();
+ return p.getCharsAsRune();
}
fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
@@ -558,10 +585,10 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
return p.jump(next, p.cons(r, try p.parseBareString(c1)));
}
if (c == '"') {
- return p.jump(next, p.cons(r, try p.parseEscapedString('"')));
+ return p.jump(next, p.cons(r, try p.parseString('"')));
}
if (c == '|') {
- return p.jump(next, p.cons(r, try p.parseEscapedString('|')));
+ return p.jump(next, p.cons(r, try p.parseString('|')));
}
p.unread(c);
switch (c) {
@@ -752,18 +779,9 @@ fn isBareChar(c: u8) bool {
};
}
-fn isBareEsc(c: u8) bool {
- return switch (c) {
- 33...126 => true,
- else => false,
- };
-}
-
fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T {
- var uc: T = undefined;
-
const c1 = try p.readNoEof(emsg);
- uc = try p.parseHexDigit(c1, emsg);
+ var uc: T = try p.parseHexDigit(c1, emsg);
while (try p.read()) |c| {
if (!std.ascii.isHex(c)) {
@@ -777,14 +795,6 @@ fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T {
return uc;
}
-fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 {
- const h1 = try p.readNoEof(emsg);
- const h2 = try p.readNoEof(emsg);
- const hi = try p.parseHexDigit(h1, emsg);
- const lo = try p.parseHexDigit(h2, emsg);
- return hi << 4 | lo;
-}
-
fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 {
return switch (c) {
'0'...'9' => c - '0',
diff --git a/src/zisp/io/parser.zig b/src/zisp/io/parser.zig
index cb96e44..cfe2bf1 100644
--- a/src/zisp/io/parser.zig
+++ b/src/zisp/io/parser.zig
@@ -56,7 +56,7 @@ pub fn parse(input: std.io.AnyReader) Value {
pub fn _parse(
input: std.io.AnyReader,
comptime panic: bool,
-) if (panic) Value else error{ParseError}!Value {
+) if (panic) Value else error{ ParseError, OutOfMemory }!Value {
const alloc = std.heap.smp_allocator;
var sfa = DefaultSfa.init(alloc);
var p = initSfa(&sfa) catch |e| if (panic) @panic("OOM") else return e;
diff --git a/src/zisp/value.zig b/src/zisp/value.zig
index 465cbbb..449a577 100644
--- a/src/zisp/value.zig
+++ b/src/zisp/value.zig
@@ -134,10 +134,9 @@
//
// ==== Strings ====
//
-// Another 48-bit space is used for strings of zero to six bytes. These are
-// NUL-terminated if shorter than six bytes, meaning that NUL cannot appear in
-// them, and they must be valid UTF-8, meaning that some other values could be
-// hidden here in the future. (UTF-8 sequences cannot contain 0xFE or 0xFF.)
+// Another 48-bit space is used for strings of zero to six bytes. Like runes,
+// these are NUL-terminated if shorter than six bytes, meaning that NUL cannot
+// appear in them, but otherwise they allow arbitrary bytes.
//
// ==== Small rationals ====
//