This repository has been archived by the owner on Aug 25, 2021. It is now read-only.

Initial Authoritative Implementation
tscs37 committed Jul 28, 2017
1 parent 58a6976 commit 1a502ac
Showing 19 changed files with 1,220 additions and 2 deletions.
17 changes: 15 additions & 2 deletions README.md
@@ -20,7 +20,7 @@ key: (
"scientific notation": 9*10^22
"multi-line strings": "
HELLO WORLD!"
"omit quotation around simple strings": Hello-Again!
"omit quotation around simple strings": Hello-Again
"load other files from subfolders": !(loadd "folder" ".conf")
"empty maps": empty
@@ -34,6 +34,18 @@ key: (
)
```

## Project Status

SECL is considered to be at Version 0.9.

Most features are now part of the authoritative implementation in this repository.

Missing features:

* Hex, Octal, and Binary Notation (`0x1F`, `0o777`, `0b110101`)
* Float E and Scientific Notation (`2.3e18` or `2.3*10^18`)
* Function Execution (functions are recognized but not executed)

## Introduction

### Strings
@@ -47,7 +59,8 @@ HelloWorld

Reserved characters: `"!@:()`

If any of the reserved characters is present, the string must be wrapped with quotation marks.
If any of the reserved characters is present, the string must be wrapped in quotation marks. The same applies if the string
equals a keyword or function name (e.g. `"false"`, `"empty"`, or `"randstr256"`).

```
"Hello World"
```
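
To illustrate the keyword rule (a made-up snippet, not from the original README; the keys are arbitrary), quoting is what separates the keyword from the equivalent plain string:

```
bool-value: false     # the boolean keyword
string-value: "false" # a plain five-character string
```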
172 changes: 172 additions & 0 deletions lexer/tokenizer.go
@@ -0,0 +1,172 @@
package lexer

import (
	"fmt"
	"unicode"
)

// The below is inspired by https://interpreterbook.com

// Tokenizer walks a rune slice and produces Tokens on demand.
type Tokenizer struct {
	input     []rune
	pos, rPos int // pos: position of curRune; rPos: read-ahead position
	curRune   rune
}

func NewTokenizer(input string) *Tokenizer {
	t := &Tokenizer{
		input: []rune(input),
	}
	t.readChar()
	return t
}

// readChar advances by one rune; curRune becomes 0 once the input is exhausted.
func (t *Tokenizer) readChar() {
	if t.rPos >= len(t.input) {
		t.curRune = 0
	} else {
		t.curRune = t.input[t.rPos]
	}
	t.pos = t.rPos
	t.rPos++
}

// peekChar returns the next rune without advancing, or 0 at end of input.
func (t *Tokenizer) peekChar() rune {
	if t.rPos >= len(t.input) {
		return 0
	}
	return t.input[t.rPos]
}

// NextToken skips leading whitespace and returns the next token in the input.
func (t *Tokenizer) NextToken() Token {
	var tok Token

	t.skipWhitespace()

	switch t.curRune {
	case '(':
		tok = newToken(TT_mapListBegin, t.curRune, t.pos, t.pos)
	case ')':
		tok = newToken(TT_mapListEnd, t.curRune, t.pos, t.pos)
	case '!':
		tok = newToken(TT_mod_execMap, t.curRune, t.pos, t.pos)
	case '@':
		tok = newToken(TT_mod_trim, t.curRune, t.pos, t.pos)
	case ':':
		tok = newToken(TT_mod_mapKey, t.curRune, t.pos, t.pos)
	case 0:
		tok.Literal = ""
		tok.Type = TT_eof
		tok.Start, tok.End = t.pos, t.pos
	default:
		if isValidCommentStarter(t.curRune) {
			tok.Type = TT_comment
			if t.curRune == '/' && t.peekChar() == '*' {
				// "/*" opens a multi-line comment.
				tok.Literal, tok.Start, tok.End = t.readMlComment()
				t.readChar()
				return tok
			} else if (t.curRune == '/' && t.peekChar() == '/') || t.curRune != '/' {
				// "//" or a single-rune starter such as '#' or ';'.
				tok.Literal, tok.Start, tok.End = t.readSlComment()
				t.readChar()
				return tok
			}
			// A '/' not followed by '/' or '*' falls through and is lexed as a chunk.
		}
		if isValidDigit(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readNumber()
			tok.Type = TT_number
			return tok
		} else if isValidChunkRune(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readChunk()
			// Chunks that match a keyword (true, off, empty, nil, ...) get their
			// dedicated token type; everything else is a bare single-word string.
			if kw, kwtt := resolveKeyword(tok.Literal); kw {
				tok.Type = kwtt
			} else {
				tok.Type = TT_singleWordString
			}
			return tok
		} else if isValidStringBorder(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readString()
			tok.Type = TT_string
			return tok
		} else {
			fmt.Print("Illegal char at ", t.pos)
			tok = newToken(TT_illegal, t.curRune, t.pos, t.pos)
		}
	}

	t.readChar()
	return tok
}

// readChunk consumes an unquoted single-word string.
func (t *Tokenizer) readChunk() (string, int, int) {
	pos := t.pos
	for isValidInChunk(t.curRune) {
		t.readChar()
	}
	return string(t.input[pos:t.pos]), pos, t.pos - 1
}

// readNumber consumes a numeric literal.
func (t *Tokenizer) readNumber() (string, int, int) {
	pos := t.pos
	for isValidInNumber(t.curRune) {
		t.readChar()
	}
	return string(t.input[pos:t.pos]), pos, t.pos - 1
}

// readString consumes a quoted string, skipping over escaped runes. Start and
// End span the surrounding quotes while the literal excludes them; End is -1
// if the input ends before the closing quote.
func (t *Tokenizer) readString() (string, int, int) {
	pos := t.pos
	t.readChar()
	for !isValidStringBorder(t.curRune) {
		t.readChar()
		if isEscapeInString(t.curRune) {
			t.readChar()
			t.readChar()
		}
		if isNull(t.curRune) {
			return string(t.input[pos+1 : t.pos-1]), pos, -1
		}
	}
	t.readChar()
	return string(t.input[pos+1 : t.pos-1]), pos, t.pos - 1
}

// readMlComment consumes a "/* ... */" comment; End is -1 if the input ends
// before the closing "*/".
func (t *Tokenizer) readMlComment() (string, int, int) {
	pos := t.pos
	for {
		t.readChar()
		if t.curRune == '*' && t.peekChar() == '/' {
			t.readChar()
			t.readChar()
			return string(t.input[pos:t.pos]), pos, t.pos - 1
		}
		if t.curRune == 0 {
			return string(t.input[pos:t.pos]), pos, -1
		}
	}
}

// readSlComment consumes a single-line comment up to (but not including) the
// next newline.
func (t *Tokenizer) readSlComment() (string, int, int) {
	pos := t.pos
	for {
		t.readChar()
		if t.curRune == '\n' {
			return string(t.input[pos:t.pos]), pos, t.pos - 1
		}
		if t.curRune == 0 {
			// Mirror readMlComment: don't loop forever on a comment that is
			// terminated by end of input rather than a newline.
			return string(t.input[pos:t.pos]), pos, -1
		}
	}
}

// skipWhitespace advances past any Unicode whitespace.
func (t *Tokenizer) skipWhitespace() {
	for unicode.IsSpace(t.curRune) {
		t.readChar()
	}
}

func newToken(ttype TokenType, cr rune, start, end int) Token {
	return Token{Type: ttype, Literal: string(cr), Start: start, End: end}
}

// resolveKeyword reports whether chunk names a keyword and, if so, which
// token type it maps to.
func resolveKeyword(chunk string) (bool, TokenType) {
	tt, ok := keywords[chunk]
	return ok, tt
}
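
For orientation, here is a minimal sketch of how this tokenizer might be driven from a consuming package. The `Token` fields and `TT_eof` come from the code above; the import path is a placeholder, since the module path is not visible in this commit.

```go
package main

import (
	"fmt"

	"example.com/secl/lexer" // placeholder import path, not from this commit
)

func main() {
	tok := lexer.NewTokenizer(`key: ( a: 1 b: "two" )`)
	// Pull tokens until the tokenizer signals end of input.
	for tk := tok.NextToken(); tk.Type != lexer.TT_eof; tk = tok.NextToken() {
		fmt.Printf("%-20v %q (runes %d-%d)\n", tk.Type, tk.Literal, tk.Start, tk.End)
	}
}
```

Note that `Start` and `End` are rune offsets rather than byte offsets, which is why the tokenizer converts its input to `[]rune` up front.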
174 changes: 174 additions & 0 deletions lexer/tokenizer_test.go
@@ -0,0 +1,174 @@
package lexer

import "testing"

func TestTokenizer_NextToken(t *testing.T) {
	input := `( !(nop) let: 909 other:@"test" k2: "test
	hello🤣🤣\"ttt" k3?: k4! decimal: 0.111e-19*10^10 let2: empty let3: nil randstr128 decb64 true off )
# This is a comment
; Also a comment
// Also a comment
/+notacomment
/* this is a ml
comment that
should stop parsing here */ ()`

	tests := []struct {
		expectedType    TokenType
		expectedLiteral string
		start, end      int
	}{
		{TT_mapListBegin, "(", 0, 0},
		{TT_mod_execMap, "!", 2, 2},
		{TT_mapListBegin, "(", 3, 3},
		{TT_function, "nop", 4, 6},
		{TT_mapListEnd, ")", 7, 7},
		{TT_singleWordString, "let", 9, 11},
		{TT_mod_mapKey, ":", 12, 12},
		{TT_number, "909", 14, 16},
		{TT_singleWordString, "other", 18, 22},
		{TT_mod_mapKey, ":", 23, 23},
		{TT_mod_trim, "@", 24, 24},
		{TT_string, "test", 25, 30},
		{TT_singleWordString, "k2", 32, 33},
		{TT_mod_mapKey, ":", 34, 34},
		{TT_string, "test\n\thello🤣🤣\\\"ttt", 36, 55},
		{TT_singleWordString, "k3?", 57, 59},
		{TT_mod_mapKey, ":", 60, 60},
		{TT_singleWordString, "k4!", 62, 64},
		{TT_singleWordString, "decimal", 66, 72},
		{TT_mod_mapKey, ":", 73, 73},
		{TT_number, "0.111e-19*10^10", 75, 89},
		{TT_singleWordString, "let2", 91, 94},
		{TT_mod_mapKey, ":", 95, 95},
		{TT_empty, "empty", 97, 101},
		{TT_singleWordString, "let3", 103, 106},
		{TT_mod_mapKey, ":", 107, 107},
		{tt_nil, "nil", 109, 111},
		{TT_randstr, "randstr128", 113, 122},
		{TT_function, "decb64", 124, 129},
		{TT_bool, "true", 131, 134},
		{TT_bool, "off", 136, 138},
		{TT_mapListEnd, ")", 140, 140},
		{TT_comment, "# This is a comment", 142, 160},
		{TT_comment, "; Also a comment", 162, 177},
		{TT_comment, "// Also a comment", 179, 195},
		{TT_singleWordString, "/+notacomment", 197, 209},
		{TT_comment, "/* this is a ml\ncomment that\nshould stop parsing here */", 211, 266},
		{TT_mapListBegin, "(", 268, 268},
		{TT_mapListEnd, ")", 269, 269},
		{TT_eof, "", 270, 270},
	}

	l := NewTokenizer(input)
	//t.Logf("Testing input: %q", input)
	for i, tt := range tests {
		tok := l.NextToken()

		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i, tests[i].start, tests[i].end, tok.Start, tok.End)
		}

		//t.Logf("Token: %s", tok.Type)
	}
}

func TestTokenizer_NextToken2(t *testing.T) {
	input := `/* abort comment on eof`

	tests := []struct {
		expectedType    TokenType
		expectedLiteral string
		start, end      int
	}{
		{TT_comment, "/* abort comment on eof", 0, -1},
	}

	l := NewTokenizer(input)
	for i, tt := range tests {
		tok := l.NextToken()

		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i, tests[i].start, tests[i].end, tok.Start, tok.End)
		}
	}
}

func TestTokenizer_NextToken3(t *testing.T) {
	input := `"abort string on eof`

	tests := []struct {
		expectedType    TokenType
		expectedLiteral string
		start, end      int
	}{
		{TT_string, "abort string on eo", 0, -1},
	}

	l := NewTokenizer(input)
	for i, tt := range tests {
		tok := l.NextToken()

		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i, tests[i].start, tests[i].end, tok.Start, tok.End)
		}
	}
}

func TestTokenizer_NextToken4(t *testing.T) {
	input := ` `

	tests := []struct {
		expectedType    TokenType
		expectedLiteral string
		start, end      int
	}{
		{TT_eof, "", 1, 1},
	}

	l := NewTokenizer(input)
	for i, tt := range tests {
		tok := l.NextToken()

		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i, tests[i].start, tests[i].end, tok.Start, tok.End)
		}
	}
}
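
Assuming a standard Go source layout, the suite can be run with `go test ./lexer/`.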