summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2026-05-31 20:58:42 +0200
committerTaylan Kammer <taylan.kammer@gmail.com>2026-05-31 20:58:42 +0200
commit37ff7af18cd2e896506e6d228058204525b4a6eb (patch)
treeb45e29afac99b8e6eb21f5eaf040f640221220e8
parent6794e27eac3e866aa2b24999e2027b301a52ebf2 (diff)
More proper shebang line parsing.
-rw-r--r--docs/c1/1-parse.md14
-rw-r--r--docs/c1/grammar/abnf.txt32
-rw-r--r--docs/c1/grammar/peg.txt6
-rw-r--r--docs/c1/grammar/zbnf.txt5
-rw-r--r--src/zisp/io/Parser.zig45
5 files changed, 89 insertions, 13 deletions
diff --git a/docs/c1/1-parse.md b/docs/c1/1-parse.md
index 0fc0da5..4eb5776 100644
--- a/docs/c1/1-parse.md
+++ b/docs/c1/1-parse.md
@@ -590,6 +590,20 @@ Notes:
further decoding of enclosed data. This is not so, since quoting is related
to code evaluation, not decoding.
+
+## Shebang
+
+There is one final "syntax sugar" translation whose sole purpose is to allow a
+shebang line at the start of a file:
+
+ #!interpreter -> (#SHBANG & interpreter)
+
+ #!interpreter argline -> (#SHBANG interpreter & argline)
+
+Under default settings, the decoder will allow this datum to appear once at the
+beginning of a per-file decoding sequence, and simply discard it.
+
+
<!--
;; Local Variables:
;; fill-column: 80
diff --git a/docs/c1/grammar/abnf.txt b/docs/c1/grammar/abnf.txt
index a5b9eca..aa67646 100644
--- a/docs/c1/grammar/abnf.txt
+++ b/docs/c1/grammar/abnf.txt
@@ -2,11 +2,27 @@
; Compatible with: https://www.quut.com/abnfgen/
-; It's unclear whether this grammar is truly complete. It has been
-; verified not to produce text that is rejected by the Zisp parser
-; --except for Unicode escape sequences for surrogate code points--
-; but there may be some text that is accepted by the parser despite
-; not being grammatical according to these rules.
+; Unlike PEG, grammar rules in BNF are non-deterministic, which makes
+; it much more challenging to express our naive parse logic. Whether
+; this ABNF file is truly accurate is difficult to assess.
+
+; The abnfgen(1) tool linked above can be used to generate arbitrary
+; strings matching the grammar in this file. These can be fed into
+; the Zisp parser to reveal some potential bugs, either in the parser
+; itself or in this ABNF grammar.
+
+; Note that the tool may generate Zisp string literals with Unicode
+; escape sequences corresponding to surrogate code points; the parser
+; may reject these. This is expected; it's difficult to rewrite this
+; ABNF grammar to exclude those Unicode values.
+
+; Other minor inaccuracies that aren't important include: This ABNF
+; forces line comments to be terminated with an LF character, when in
+; fact the end-of-file may also terminate them; the same applies to
+; hash-bang parsing which doesn't actually have to end in LF. These
+; discrepancies won't make abnfgen(1) generate invalid strings; they
+; only make this ABNF more strict than the Zisp parser, so it won't
+; generate some strings that the parser would actually accept.
Stream = [ Unit *( Blank Unit ) ] *Blank [Trail]
@@ -52,7 +68,7 @@ RuneDotStr = "#" RuneName "\" SpecialStr
RuneClad = "#" RuneName CladDatum
-HashBang = "#" "!" *( SP / HTAB ) BareString
+HashBang = "#" "!" *( SP / HTAB ) HBLine LF
LabelRef = "#" "%" Label "%"
@@ -101,6 +117,10 @@ RuneName = ALPHA *5( ALPHA / DIGIT )
Label = 1*12( HEXDIG )
+HBLine = 1*HBChar [ 1*( SP / HTAB ) *HBChar ]
+
+HBChar = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP
+
RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad
/ LabelRef / LabelDef / HashStr / HashDotStr / HashClad
diff --git a/docs/c1/grammar/peg.txt b/docs/c1/grammar/peg.txt
index 465123f..cee9c84 100644
--- a/docs/c1/grammar/peg.txt
+++ b/docs/c1/grammar/peg.txt
@@ -50,11 +50,12 @@ HexByte <- HEXDIG HEXDIG
UnicodeSV <- HEXDIG+
RuneExpr <- Rune ( '\' BareString / CladDatum )?
-HashBang <- '!' [\t ]* BareString
+HashBang <- '!' [\t ]* HBLine '\n'?
LabelExpr <- '%' Label ( '%' / '=' Datum )
HashDatum <- '\' BareString / CladDatum
Rune <- ALPHA ( ALPHA / DIGIT )*
+HBLine <- HBChars+ ( [\t ]+ ( HBChars ( !'\n' . )* )? )?
Label <- HEXDIG+
ParenList <- '(' ListBody ')'
@@ -63,6 +64,9 @@ BraceList <- '{' ListBody '}'
ListBody <- Unit* ( Blank* '&' Unit )? Blank*
+HBChars <- ![\t\n ] .
+
+
DIGIT <- [0-9]
ALPHA <- [a-zA-Z]
HEXDIG <- [0-9a-fA-F]
diff --git a/docs/c1/grammar/zbnf.txt b/docs/c1/grammar/zbnf.txt
index 0cbceab..704db22 100644
--- a/docs/c1/grammar/zbnf.txt
+++ b/docs/c1/grammar/zbnf.txt
@@ -50,11 +50,12 @@ HexByte : HEXDIG HEXDIG
UnicodeSV : HEXDIG+
RuneExpr : Rune [ '\' BareString | CladDatum ]
-HashBang : '!' ( SP | HTAB )* BareString
+HashBang : '!' ( SP | HTAB )* HBLine [ LF ]
LabelExpr : '%' Label ( '%' | '=' Datum )
HashDatum : '\' BareString | CladDatum
Rune : ALPHA ( ALPHA | DIGIT )*
+HBLine : HBChars+ [ ( SP | HTAB )+ [ HBChars ~( LF )* ] ]
Label : HEXDIG+
ParenList : '(' ListBody ')'
@@ -63,6 +64,8 @@ BraceList : '{' ListBody '}'
ListBody : Unit* [ Blank* '&' Unit ] Blank*
+HBChars : ~( SP | HTAB | LF )
+
;; Local Variables:
;; eval: (flyspell-mode -1)
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 8e2908d..e29868a 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -620,11 +620,46 @@ fn endRuneDatum(p: *Parser) !void {
}
fn parseHashBang(p: *Parser, next: Fn) !void {
- while (try p.readNoEof2("hash-bang")) |c| {
- if (c == ' ' or c == '\t') continue;
- const s = try p.getBareString(c);
- return p.jump(next, p.cons(SHBANG, s));
- }
+ const val = try p.getHashBangValue();
+ return p.jump(next, p.cons(SHBANG, val));
+}
+
+fn getHashBangValue(p: *Parser) !Value {
+ while (try p.readNoEof2("hash-bang")) |c| switch (c) {
+ ' ', '\t' => continue,
+ '\n' => return p.err(.InvalidCharacter, "hash-bang"),
+ else => {
+ try p.addChar(c);
+ while (try p.read()) |c2| switch (c2) {
+ '\n' => return p.getCharsAsString(),
+ ' ', '\t' => break,
+ else => try p.addChar(c2),
+ };
+ const interp = try p.getCharsAsString();
+ if (try p.getHashBangArgLine()) |arg_line| {
+ return p.cons(interp, arg_line);
+ } else {
+ return interp;
+ }
+ },
+ };
+ unreachable;
+}
+
+fn getHashBangArgLine(p: *Parser) !?Value {
+ while (try p.read()) |c| switch (c) {
+ ' ', '\t' => continue,
+ '\n' => return null,
+ else => {
+ try p.addChar(c);
+ while (try p.read()) |c2| {
+ if (c2 == '\n') break;
+ try p.addChar(c2);
+ }
+ return try p.getCharsAsString();
+ },
+ };
+ return null;
}
fn parseLabel(p: *Parser, next: Fn) !void {