More proper shebang line parsing.

author: Taylan Kammer <taylan.kammer@gmail.com> 2026-05-31 20:58:42 +0200
committer: Taylan Kammer <taylan.kammer@gmail.com> 2026-05-31 20:58:42 +0200
commit: 37ff7af18cd2e896506e6d228058204525b4a6eb (patch)
tree: b45e29afac99b8e6eb21f5eaf040f640221220e8
parent: 6794e27eac3e866aa2b24999e2027b301a52ebf2 (diff)
5 files changed, 89 insertions, 13 deletions
diff --git a/docs/c1/1-parse.md b/docs/c1/1-parse.md
index 0fc0da5..4eb5776 100644
--- a/docs/c1/1-parse.md
+++ b/docs/c1/1-parse.md
@@ -590,6 +590,20 @@ Notes:
   further decoding of enclosed data.  This is not so, since quoting is related
   to code evaluation, not decoding.
 
+
+## Shebang
+
+There is one final "syntax sugar" translation whose sole purpose is to allow a
+shebang line at the start of a file:
+
+    #!interpreter          ->  (#SHBANG & interpreter)
+
+    #!interpreter argline  ->  (#SHBANG interpreter & argline)
+
+Under default settings, the decoder will allow this datum to appear once at the
+beginning of a per-file decoding sequence, and simply discard it.
+
+
 <!--
 ;; Local Variables:
 ;; fill-column: 80
diff --git a/docs/c1/grammar/abnf.txt b/docs/c1/grammar/abnf.txt
index a5b9eca..aa67646 100644
--- a/docs/c1/grammar/abnf.txt
+++ b/docs/c1/grammar/abnf.txt
@@ -2,11 +2,27 @@
 
 ; Compatible with: https://www.quut.com/abnfgen/
 
-; It's unclear whether this grammar is truly complete.  It has been
-; verified not to produce text that is rejected by the Zisp parser
-; --except for Unicode escape sequences for surrogate code points--
-; but there may be some text that is accepted by the parser despite
-; not being grammatical according to these rules.
+; Unlike PEG, grammar rules in BNF are non-deterministic, which makes
+; it much more challenging to express our naive parse logic.  Whether
+; this ABNF file is truly accurate is difficult to assess.
+
+; The abnfgen(1) tool linked above can be used to generate arbitrary
+; strings matching the grammar in this file.  These can be fed into
+; the Zisp parser to reveal potential bugs, either in the parser
+; itself or in this ABNF grammar.
+
+; Note that the tool may generate Zisp string literals with Unicode
+; escape sequences corresponding to surrogate code points; the parser
+; may reject these.  This is expected; it's difficult to rewrite this
+; ABNF grammar to exclude those Unicode values.
+
+; Other minor inaccuracies that aren't important include: This ABNF
+; forces line comments to be terminated with an LF character, when in
+; fact the end-of-file may also terminate them; the same applies to
+; hash-bang parsing which doesn't actually have to end in LF.  These
+; discrepancies won't make abnfgen(1) generate invalid strings; they
+; only make this ABNF more strict than the Zisp parser, so it won't
+; generate some strings that the parser would actually accept.
 
 
 Stream        = [ Unit *( Blank Unit ) ] *Blank [Trail]
@@ -52,7 +68,7 @@ RuneDotStr    = "#" RuneName "\" SpecialStr
 
 RuneClad      = "#" RuneName CladDatum
 
-HashBang      = "#" "!" *( SP / HTAB ) BareString
+HashBang      = "#" "!" *( SP / HTAB ) HBLine LF
 
 LabelRef      = "#" "%" Label "%"
 
@@ -101,6 +117,10 @@ RuneName      = ALPHA *5( ALPHA / DIGIT )
 
 Label         = 1*12( HEXDIG )
 
+HBLine        = 1*HBChar [ 1*( SP / HTAB ) *HBChar ]
+
+HBChar        = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP
+
 
 RJoinDatum    = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad
               / LabelRef / LabelDef / HashStr / HashDotStr / HashClad
diff --git a/docs/c1/grammar/peg.txt b/docs/c1/grammar/peg.txt
index 465123f..cee9c84 100644
--- a/docs/c1/grammar/peg.txt
+++ b/docs/c1/grammar/peg.txt
@@ -50,11 +50,12 @@ HexByte      <- HEXDIG HEXDIG
 UnicodeSV    <- HEXDIG+
 
 RuneExpr     <- Rune ( '\' BareString / CladDatum )?
-HashBang     <- '!' [\t ]* BareString
+HashBang     <- '!' [\t ]* HBLine '\n'?
 LabelExpr    <- '%' Label ( '%' / '=' Datum )
 HashDatum    <- '\' BareString / CladDatum
 
 Rune         <- ALPHA ( ALPHA / DIGIT )*
+HBLine       <- HBChars+ [\t ]* ( HBChars+ )?
 Label        <- HEXDIG+
 
 ParenList    <- '(' ListBody ')'
@@ -63,6 +64,9 @@ BraceList    <- '{' ListBody '}'
 
 ListBody     <- Unit* ( Blank* '&' Unit )? Blank*
 
+HBChars      <- ![\t\n ] .
+
+
 DIGIT        <- [0-9]
 ALPHA        <- [a-zA-Z]
 HEXDIG       <- [0-9a-fA-F]
diff --git a/docs/c1/grammar/zbnf.txt b/docs/c1/grammar/zbnf.txt
index 0cbceab..704db22 100644
--- a/docs/c1/grammar/zbnf.txt
+++ b/docs/c1/grammar/zbnf.txt
@@ -50,11 +50,12 @@ HexByte       : HEXDIG HEXDIG
 UnicodeSV     : HEXDIG+
 
 RuneExpr      : Rune [ '\' BareString | CladDatum ]
-HashBang      : '!' ( SP | HTAB )* BareString
+HashBang      : '!' ( SP | HTAB )* HBLine [ LF ]
 LabelExpr     : '%' Label ( '%' | '=' Datum )
 HashDatum     : '\' BareString | CladDatum
 
 Rune          : ALPHA ( ALPHA | DIGIT )*
+HBLine        : HBChars+ ( SP | HTAB )* [ HBChars+ ]
 Label         : HEXDIG+
 
 ParenList     : '(' ListBody ')'
@@ -63,6 +64,8 @@ BraceList     : '{' ListBody '}'
 
 ListBody      : Unit* [ Blank* '&' Unit ] Blank*
 
+HBChars       : ~( SP | HTAB | LF )
+
 
 ;; Local Variables:
 ;; eval: (flyspell-mode -1)
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 8e2908d..e29868a 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -620,11 +620,46 @@ fn endRuneDatum(p: *Parser) !void {
 }
 
 fn parseHashBang(p: *Parser, next: Fn) !void {
-    while (try p.readNoEof2("hash-bang")) |c| {
-        if (c == ' ' or c == '\t') continue;
-        const s = try p.getBareString(c);
-        return p.jump(next, p.cons(SHBANG, s));
-    }
+    const val = try p.getHashBangValue(); // the interpreter string, or a cons of interpreter and argument line
+    return p.jump(next, p.cons(SHBANG, val)); // cons SHBANG onto the value and pass the result to `next` via p.jump
+}
+
+fn getHashBangValue(p: *Parser) !Value { // reads the interpreter, plus an optional argument line, that follows "#!"
+    while (try p.readNoEof2("hash-bang")) |c| switch (c) {
+        ' ', '\t' => continue, // skip blanks between "#!" and the interpreter
+        '\n' => return p.err(.InvalidCharacter, "hash-bang"), // the line ended before any interpreter character
+        else => {
+            try p.addChar(c); // first character of the interpreter
+            while (try p.read()) |c2| switch (c2) {
+                '\n' => return p.getCharsAsString(), // line ends right after the interpreter: no argument line
+                ' ', '\t' => break, // a blank ends the interpreter
+                else => try p.addChar(c2),
+            };
+            const interp = try p.getCharsAsString(); // reached after a blank, or once p.read() yields null
+            if (try p.getHashBangArgLine()) |arg_line| {
+                return p.cons(interp, arg_line); // cons of interpreter and argument line
+            } else {
+                return interp; // no argument line found: the interpreter alone
+            }
+        },
+    };
+    unreachable; // NOTE(review): relies on readNoEof2 never yielding null here — confirm
+}
+
+fn getHashBangArgLine(p: *Parser) !?Value { // reads the remainder of the line as the argument line; null if none is found
+    while (try p.read()) |c| switch (c) {
+        ' ', '\t' => continue, // skip blanks before the argument line
+        '\n' => return null, // only blanks were seen before the line break
+        else => {
+            try p.addChar(c); // first character of the argument line
+            while (try p.read()) |c2| {
+                if (c2 == '\n') break; // the line break ends the argument line and is not added
+                try p.addChar(c2); // every other character, blanks included, is kept
+            }
+            return try p.getCharsAsString();
+        },
+    };
+    return null; // p.read() yielded null before any non-blank character
 }
 
 fn parseLabel(p: *Parser, next: Fn) !void {
author	Taylan Kammer <taylan.kammer@gmail.com>	2026-05-31 20:58:42 +0200
committer	Taylan Kammer <taylan.kammer@gmail.com>	2026-05-31 20:58:42 +0200
commit	37ff7af18cd2e896506e6d228058204525b4a6eb (patch)
tree	b45e29afac99b8e6eb21f5eaf040f640221220e8
parent	6794e27eac3e866aa2b24999e2027b301a52ebf2 (diff)