summaryrefslogtreecommitdiff
path: root/spec
diff options
context:
space:
mode:
Diffstat (limited to 'spec')
-rw-r--r--spec/parser.ebnf75
-rw-r--r--spec/syntax.md34
2 files changed, 109 insertions, 0 deletions
diff --git a/spec/parser.ebnf b/spec/parser.ebnf
new file mode 100644
index 0000000..9e02fba
--- /dev/null
+++ b/spec/parser.ebnf
@@ -0,0 +1,75 @@
+unit : blank* ( datum blank? | EOF ) ;
+
+
+blank : 9...13 | 32 | comment ;
+
+datum : one_datum ( join_char? one_datum )* ;
+
+join_char : '.' | ':' | '|' ;
+
+
+comment : ';' ( skip_unit | skip_line ) ;
+
+skip_unit : '~' unit ;
+
+skip_line : ( ~LF )* LF? ;
+
+
+one_datum : ( bare_str | clad_datum ) ;
+
+bare_str : bare_str_elt+ ;
+
+clad_datum : '\' bare_esc_str
+ | '"' quoted_str '"'
+ | '#' hash_expr
+ | '(' blank* list? ')'
+ | '[' blank* list? ']'
+ | '{' blank* list? '}'
+ | quote_expr
+ ;
+
+
+bare_str_elt : bare_char | '\' bare_esc ;
+
+
+bare_esc_str : bare_esc bare_str_elt* ;
+
+quoted_str : ( quoted_char | '\' quoted_esc )* ;
+
+hash_expr : rune clad_datum?
+ | '%' label ( '%' | '=' unit )
+ | clad_datum
+ ;
+
+list : unit+ ( '.' blank+ unit )? blank* ;
+
+quote_expr : ( "'" | "`" | "," ) datum ;
+
+
+bare_char : letter | digit
+ | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
+ | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~'
+ ;
+
+bare_esc : 33...126 ;
+
+
+quoted_char : ~( '"' | '\' ) ;
+
+quoted_esc : '\' | '"' | 'a' | 'b' | 'e'
+ | 'f' | 'n' | 'r' | 't' | 'v'
+ | 'x' hex_digit{2}
+ | 'u' '{' hex_digit+ '}'
+ ;
+
+
+rune : letter ( letter | digit ){0,5} ;
+
+label : hex_digit{1,12} ;
+
+
+letter : 'a'...'z' | 'A'...'Z' ;
+
+digit : '0'...'9' ;
+
+hex_digit : '0'...'9' | 'a'...'f' | 'A'...'F' ;
diff --git a/spec/syntax.md b/spec/syntax.md
new file mode 100644
index 0000000..b85ed78
--- /dev/null
+++ b/spec/syntax.md
@@ -0,0 +1,34 @@
+# Zisp S-Expression Syntax
+
+We use a BNF notation with the following rules:
+
+* Concatenation of expressions is implicit: `foo bar` means `foo`
+ followed by `bar`.
+
+* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
+ which have the meanings they have in regular expressions.
+
+* The syntax is defined in terms of bytes, not characters. Terminals
+ `'c'` and `"c"` refer to the ASCII value of the given character `c`.
+ Numbers are in decimal and refer to a byte with the given value.
+
+* The `~` prefix means NOT. It only applies to rules that match one
+ byte, and negates them. For example, `~( 'a' | 'b' )` matches any
+ byte other than 97 and 98.
+
+* Ranges of terminal values are expressed as `x...y` (inclusive).
+
+* There is no ambiguity, backtracking, or look-ahead beyond the byte
+ currently being matched. Rules match left to right, depth-first,
+ and greedily. As soon as the input matches the first terminal of a
+ rule, it must match that rule to the end.
+
+The last rule means that the BNF is very simple to translate to code.
+
+The parser consumes one `unit` from an input stream every time it's
+called; it returns the `datum` therein, or EOF.
+
+```
+
+
+```