This repository has been archived by the owner on Aug 25, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
1,220 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
package lexer | ||
|
||
import ( | ||
"fmt" | ||
"unicode" | ||
) | ||
|
||
// The below is inspired by https://interpreterbook.com | ||
|
||
// Tokenizer performs rune-by-rune lexical scanning of an input string.
// Positions reported in tokens are rune indices, not byte offsets.
type Tokenizer struct {
	input []rune // full input decoded to runes up front
	pos, rPos int // pos: index of curRune; rPos: index of the next rune to read
	curRune rune // rune currently under examination; 0 once input is exhausted
}
|
||
func NewTokenizer(input string) *Tokenizer { | ||
t := &Tokenizer{ | ||
input: []rune(input), | ||
} | ||
t.readChar() | ||
return t | ||
} | ||
|
||
func (t *Tokenizer) readChar() { | ||
if t.rPos >= len(t.input) { | ||
t.curRune = 0 | ||
} else { | ||
t.curRune = t.input[t.rPos] | ||
} | ||
t.pos = t.rPos | ||
t.rPos++ | ||
} | ||
|
||
func (t *Tokenizer) peekChar() rune { | ||
if t.rPos >= len(t.input) { | ||
return 0 | ||
} else { | ||
return t.input[t.rPos] | ||
} | ||
} | ||
|
||
// NextToken skips leading whitespace and returns the next token, leaving
// the tokenizer positioned just past it.
//
// Single-rune tokens (delimiters and modifiers) are handled by the switch;
// everything else lands in the default branch, which dispatches on rune
// class in a fixed order: comment starters first, then digits, then chunk
// runes (bare words / keywords), then string borders. Anything left over is
// reported as TT_illegal.
func (t *Tokenizer) NextToken() Token {
	var tok Token

	t.skipWhitespace()

	switch t.curRune {
	case '(':
		tok = newToken(TT_mapListBegin, t.curRune, t.pos, t.pos)
	case ')':
		tok = newToken(TT_mapListEnd, t.curRune, t.pos, t.pos)
	case '!':
		tok = newToken(TT_mod_execMap, t.curRune, t.pos, t.pos)
	case '@':
		tok = newToken(TT_mod_trim, t.curRune, t.pos, t.pos)
	case ':':
		tok = newToken(TT_mod_mapKey, t.curRune, t.pos, t.pos)
	case 0:
		// NUL sentinel produced by readChar: input exhausted.
		tok.Literal = ""
		tok.Type = TT_eof
		tok.Start, tok.End = t.pos, t.pos
	default:
		if isValidCommentStarter(t.curRune) {
			tok.Type = TT_comment
			if t.curRune == '/' && t.peekChar() == '*' {
				// "/*" opens a multi-line comment.
				tok.Literal, tok.Start, tok.End = t.readMlComment()
				t.readChar()
				return tok
			} else if (t.curRune == '/' && t.peekChar() == '/') || t.curRune != '/' {
				// "//" or a non-slash comment starter opens a single-line
				// comment.
				tok.Literal, tok.Start, tok.End = t.readSlComment()
				t.readChar()
				return tok
			}
			// A lone '/' not followed by '/' or '*' deliberately falls
			// through with tok.Type already set to TT_comment; the chunk
			// branch below overwrites the type (e.g. "/+notacomment" is a
			// single-word string, per the package tests).
		}
		if isValidDigit(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readNumber()
			tok.Type = TT_number
			return tok
		} else if isValidChunkRune(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readChunk()
			// A chunk matching a reserved word gets its keyword type;
			// otherwise it is a bare single-word string.
			if kw, kwtt := resolveKeyword(tok.Literal); kw {
				tok.Type = kwtt
			} else {
				tok.Type = TT_singleWordString
			}
			return tok
		} else if isValidStringBorder(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readString()
			tok.Type = TT_string
			return tok
		} else {
			fmt.Print("Illegal char at ", t.pos)
			tok = newToken(TT_illegal, t.curRune, t.pos, t.pos)
		}
	}

	t.readChar()
	return tok
}
|
||
func (t *Tokenizer) readChunk() (string, int, int) { | ||
pos := t.pos | ||
for isValidInChunk(t.curRune) { | ||
t.readChar() | ||
} | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
|
||
func (t *Tokenizer) readNumber() (string, int, int) { | ||
pos := t.pos | ||
for isValidInNumber(t.curRune) { | ||
t.readChar() | ||
} | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
|
||
// readString consumes a border-delimited string and returns the literal
// without the surrounding border runes, plus its start and end positions.
// If the input ends before a closing border is found, End is -1 to flag the
// unterminated string (the partial literal is still returned).
func (t *Tokenizer) readString() (string, int, int) {
	pos := t.pos
	// Step past the opening border rune.
	t.readChar()
	for !isValidStringBorder(t.curRune) {
		t.readChar()
		// If the rune just reached is an escape, skip it and the escaped
		// rune so an escaped border does not terminate the string.
		// NOTE(review): the escape check happens only after the advance, so
		// the very first content rune is never tested for escape — confirm
		// this ordering is intended.
		if isEscapeInString(t.curRune) {
			t.readChar()
			t.readChar()
		}
		// EOF inside the string: return what was collected; End = -1 marks
		// the abort (matches readMlComment's convention).
		if isNull(t.curRune) {
			return string(t.input[pos+1 : t.pos-1]), pos, -1
		}
	}
	// Step past the closing border rune.
	t.readChar()
	return string(t.input[pos+1 : t.pos-1]), pos, t.pos - 1
}
|
||
func (t *Tokenizer) readMlComment() (string, int, int) { | ||
pos := t.pos | ||
for { | ||
t.readChar() | ||
if t.curRune == '*' && t.peekChar() == '/' { | ||
t.readChar() | ||
t.readChar() | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
if t.curRune == 0 { | ||
return string(t.input[pos:t.pos]), pos, -1 | ||
} | ||
} | ||
} | ||
|
||
func (t *Tokenizer) readSlComment() (string, int, int) { | ||
pos := t.pos | ||
for { | ||
t.readChar() | ||
if t.curRune == '\n' { | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
} | ||
} | ||
|
||
func (t *Tokenizer) skipWhitespace() { | ||
for unicode.IsSpace(t.curRune) { | ||
t.readChar() | ||
} | ||
} | ||
|
||
func newToken(ttype TokenType, cr rune, start, end int) Token { | ||
return Token{Type: ttype, Literal: string(cr), Start: start, End: end} | ||
} | ||
|
||
func resolveKeyword(chunk string) (bool, TokenType) { | ||
tt, ok := keywords[chunk] | ||
return ok, tt | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
package lexer | ||
|
||
import "testing" | ||
|
||
// TestTokenizer_NextToken walks the tokenizer over one representative input
// covering every token class (delimiters, modifiers, numbers, keywords,
// quoted and bare strings, all three comment styles) and checks each emitted
// token's type, literal, and rune positions against the table below.
func TestTokenizer_NextToken(t *testing.T) {
	// NOTE(review): the expected literal "test\n\thello…" and the offsets
	// below imply leading whitespace inside this raw string (e.g. a tab after
	// the first newline) — confirm it was not lost in transfer.
	input := `( !(nop) let: 909 other:@"test" k2: "test
hello🤣🤣\"ttt" k3?: k4! decimal: 0.111e-19*10^10 let2: empty let3: nil randstr128 decb64 true off )
# This is a comment
; Also a comment
// Also a comment
/+notacomment
/* this is a ml
comment that
should stop parsing here */ ()`

	// Each entry is the next expected token, in emission order; start/end are
	// inclusive rune indices into input.
	tests := []struct{
		expectedType TokenType
		expectedLiteral string
		start,end int
	}{
		{TT_mapListBegin, "(", 0, 0},
		{TT_mod_execMap, "!", 2, 2},
		{TT_mapListBegin, "(", 3, 3},
		{TT_function, "nop", 4, 6},
		{TT_mapListEnd, ")", 7, 7},
		{TT_singleWordString, "let", 9, 11},
		{TT_mod_mapKey, ":", 12, 12},
		{TT_number, "909", 14, 16},
		{TT_singleWordString, "other", 18, 22},
		{TT_mod_mapKey, ":", 23, 23},
		{TT_mod_trim, "@", 24, 24},
		{TT_string, "test", 25, 30},
		{TT_singleWordString, "k2", 32, 33},
		{TT_mod_mapKey, ":", 34, 34},
		{TT_string, "test\n\thello🤣🤣\\\"ttt", 36, 55},
		{TT_singleWordString, "k3?", 57, 59},
		{TT_mod_mapKey, ":", 60, 60},
		{TT_singleWordString, "k4!", 62, 64},
		{TT_singleWordString, "decimal", 66, 72},
		{TT_mod_mapKey, ":", 73, 73},
		{TT_number, "0.111e-19*10^10", 75, 89},
		{TT_singleWordString, "let2", 91, 94},
		{TT_mod_mapKey, ":", 95, 95},
		{TT_empty, "empty", 97, 101},
		{TT_singleWordString, "let3", 103, 106},
		{TT_mod_mapKey, ":", 107, 107},
		{tt_nil, "nil",109,111},
		{TT_randstr, "randstr128", 113, 122},
		{TT_function, "decb64", 124, 129},
		{TT_bool, "true", 131, 134},
		{TT_bool, "off", 136, 138},
		{TT_mapListEnd, ")", 140, 140},
		{TT_comment, "# This is a comment", 142, 160},
		{TT_comment, "; Also a comment", 162, 177},
		{TT_comment, "// Also a comment", 179, 195},
		{TT_singleWordString, "/+notacomment", 197, 209},
		{TT_comment, "/* this is a ml\ncomment that\nshould stop parsing here */", 211, 266},
		{TT_mapListBegin, "(", 268, 268},
		{TT_mapListEnd, ")", 269, 269},
		{TT_eof, "", 270, 270},
	}

	l := NewTokenizer(input)
	//t.Logf("Testing input: %q", input)
	for i, tt := range tests {

		tok := l.NextToken()

		// Literal first: it is the most informative mismatch.
		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		// Positions are inclusive rune indices.
		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End)
		}

		//t.Logf("Token: %s", tok.Type)
	}
}
|
||
func TestTokenizer_NextToken2(t *testing.T) { | ||
input := `/* abort comment on eof` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_comment, "/* abort comment on eof", 0, -1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} | ||
|
||
func TestTokenizer_NextToken3(t *testing.T) { | ||
input := `"abort string on eof` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_string, "abort string on eo", 0, -1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} | ||
|
||
|
||
func TestTokenizer_NextToken4(t *testing.T) { | ||
input := ` ` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_eof, "", 1, 1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} |
Oops, something went wrong.