This repository has been archived by the owner on Aug 25, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
1,220 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
package lexer | ||
|
||
import ( | ||
"fmt" | ||
"unicode" | ||
) | ||
|
||
// The below is inspired by https://interpreterbook.com | ||
|
||
// Tokenizer performs rune-by-rune lexical scanning of an input string.
// Positions reported in tokens are rune indices, not byte offsets.
type Tokenizer struct {
	input []rune // full input decoded to runes up front
	pos, rPos int // pos: index of curRune; rPos: index of the next rune to read
	curRune rune // rune currently under examination; 0 once input is exhausted
}
|
||
func NewTokenizer(input string) *Tokenizer { | ||
t := &Tokenizer{ | ||
input: []rune(input), | ||
} | ||
t.readChar() | ||
return t | ||
} | ||
|
||
func (t *Tokenizer) readChar() { | ||
if t.rPos >= len(t.input) { | ||
t.curRune = 0 | ||
} else { | ||
t.curRune = t.input[t.rPos] | ||
} | ||
t.pos = t.rPos | ||
t.rPos++ | ||
} | ||
|
||
func (t *Tokenizer) peekChar() rune { | ||
if t.rPos >= len(t.input) { | ||
return 0 | ||
} else { | ||
return t.input[t.rPos] | ||
} | ||
} | ||
|
||
// NextToken skips leading whitespace and returns the next token, leaving
// the tokenizer positioned just past it.
//
// Single-rune tokens (delimiters and modifiers) are handled by the switch;
// everything else lands in the default branch, which dispatches on rune
// class in a fixed order: comment starters first, then digits, then chunk
// runes (bare words / keywords), then string borders. Anything left over is
// reported as TT_illegal.
func (t *Tokenizer) NextToken() Token {
	var tok Token

	t.skipWhitespace()

	switch t.curRune {
	case '(':
		tok = newToken(TT_mapListBegin, t.curRune, t.pos, t.pos)
	case ')':
		tok = newToken(TT_mapListEnd, t.curRune, t.pos, t.pos)
	case '!':
		tok = newToken(TT_mod_execMap, t.curRune, t.pos, t.pos)
	case '@':
		tok = newToken(TT_mod_trim, t.curRune, t.pos, t.pos)
	case ':':
		tok = newToken(TT_mod_mapKey, t.curRune, t.pos, t.pos)
	case 0:
		// NUL sentinel produced by readChar: input exhausted.
		tok.Literal = ""
		tok.Type = TT_eof
		tok.Start, tok.End = t.pos, t.pos
	default:
		if isValidCommentStarter(t.curRune) {
			tok.Type = TT_comment
			if t.curRune == '/' && t.peekChar() == '*' {
				// "/*" opens a multi-line comment.
				tok.Literal, tok.Start, tok.End = t.readMlComment()
				t.readChar()
				return tok
			} else if (t.curRune == '/' && t.peekChar() == '/') || t.curRune != '/' {
				// "//" or a non-slash comment starter opens a single-line
				// comment.
				tok.Literal, tok.Start, tok.End = t.readSlComment()
				t.readChar()
				return tok
			}
			// A lone '/' not followed by '/' or '*' deliberately falls
			// through with tok.Type already set to TT_comment; the chunk
			// branch below overwrites the type (e.g. "/+notacomment" is a
			// single-word string, per the package tests).
		}
		if isValidDigit(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readNumber()
			tok.Type = TT_number
			return tok
		} else if isValidChunkRune(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readChunk()
			// A chunk matching a reserved word gets its keyword type;
			// otherwise it is a bare single-word string.
			if kw, kwtt := resolveKeyword(tok.Literal); kw {
				tok.Type = kwtt
			} else {
				tok.Type = TT_singleWordString
			}
			return tok
		} else if isValidStringBorder(t.curRune) {
			tok.Literal, tok.Start, tok.End = t.readString()
			tok.Type = TT_string
			return tok
		} else {
			fmt.Print("Illegal char at ", t.pos)
			tok = newToken(TT_illegal, t.curRune, t.pos, t.pos)
		}
	}

	t.readChar()
	return tok
}
|
||
func (t *Tokenizer) readChunk() (string, int, int) { | ||
pos := t.pos | ||
for isValidInChunk(t.curRune) { | ||
t.readChar() | ||
} | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
|
||
func (t *Tokenizer) readNumber() (string, int, int) { | ||
pos := t.pos | ||
for isValidInNumber(t.curRune) { | ||
t.readChar() | ||
} | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
|
||
// readString consumes a border-delimited string and returns the literal
// without the surrounding border runes, plus its start and end positions.
// If the input ends before a closing border is found, End is -1 to flag the
// unterminated string (the partial literal is still returned).
func (t *Tokenizer) readString() (string, int, int) {
	pos := t.pos
	// Step past the opening border rune.
	t.readChar()
	for !isValidStringBorder(t.curRune) {
		t.readChar()
		// If the rune just reached is an escape, skip it and the escaped
		// rune so an escaped border does not terminate the string.
		// NOTE(review): the escape check happens only after the advance, so
		// the very first content rune is never tested for escape — confirm
		// this ordering is intended.
		if isEscapeInString(t.curRune) {
			t.readChar()
			t.readChar()
		}
		// EOF inside the string: return what was collected; End = -1 marks
		// the abort (matches readMlComment's convention).
		if isNull(t.curRune) {
			return string(t.input[pos+1 : t.pos-1]), pos, -1
		}
	}
	// Step past the closing border rune.
	t.readChar()
	return string(t.input[pos+1 : t.pos-1]), pos, t.pos - 1
}
|
||
func (t *Tokenizer) readMlComment() (string, int, int) { | ||
pos := t.pos | ||
for { | ||
t.readChar() | ||
if t.curRune == '*' && t.peekChar() == '/' { | ||
t.readChar() | ||
t.readChar() | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
if t.curRune == 0 { | ||
return string(t.input[pos:t.pos]), pos, -1 | ||
} | ||
} | ||
} | ||
|
||
func (t *Tokenizer) readSlComment() (string, int, int) { | ||
pos := t.pos | ||
for { | ||
t.readChar() | ||
if t.curRune == '\n' { | ||
return string(t.input[pos:t.pos]), pos, t.pos - 1 | ||
} | ||
} | ||
} | ||
|
||
func (t *Tokenizer) skipWhitespace() { | ||
for unicode.IsSpace(t.curRune) { | ||
t.readChar() | ||
} | ||
} | ||
|
||
func newToken(ttype TokenType, cr rune, start, end int) Token { | ||
return Token{Type: ttype, Literal: string(cr), Start: start, End: end} | ||
} | ||
|
||
func resolveKeyword(chunk string) (bool, TokenType) { | ||
tt, ok := keywords[chunk] | ||
return ok, tt | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
package lexer | ||
|
||
import "testing" | ||
|
||
// TestTokenizer_NextToken walks the tokenizer over one representative input
// covering every token class (delimiters, modifiers, numbers, keywords,
// quoted and bare strings, all three comment styles) and checks each emitted
// token's type, literal, and rune positions against the table below.
func TestTokenizer_NextToken(t *testing.T) {
	// NOTE(review): the expected literal "test\n\thello…" and the offsets
	// below imply leading whitespace inside this raw string (e.g. a tab after
	// the first newline) — confirm it was not lost in transfer.
	input := `( !(nop) let: 909 other:@"test" k2: "test
hello🤣🤣\"ttt" k3?: k4! decimal: 0.111e-19*10^10 let2: empty let3: nil randstr128 decb64 true off )
# This is a comment
; Also a comment
// Also a comment
/+notacomment
/* this is a ml
comment that
should stop parsing here */ ()`

	// Each entry is the next expected token, in emission order; start/end are
	// inclusive rune indices into input.
	tests := []struct{
		expectedType TokenType
		expectedLiteral string
		start,end int
	}{
		{TT_mapListBegin, "(", 0, 0},
		{TT_mod_execMap, "!", 2, 2},
		{TT_mapListBegin, "(", 3, 3},
		{TT_function, "nop", 4, 6},
		{TT_mapListEnd, ")", 7, 7},
		{TT_singleWordString, "let", 9, 11},
		{TT_mod_mapKey, ":", 12, 12},
		{TT_number, "909", 14, 16},
		{TT_singleWordString, "other", 18, 22},
		{TT_mod_mapKey, ":", 23, 23},
		{TT_mod_trim, "@", 24, 24},
		{TT_string, "test", 25, 30},
		{TT_singleWordString, "k2", 32, 33},
		{TT_mod_mapKey, ":", 34, 34},
		{TT_string, "test\n\thello🤣🤣\\\"ttt", 36, 55},
		{TT_singleWordString, "k3?", 57, 59},
		{TT_mod_mapKey, ":", 60, 60},
		{TT_singleWordString, "k4!", 62, 64},
		{TT_singleWordString, "decimal", 66, 72},
		{TT_mod_mapKey, ":", 73, 73},
		{TT_number, "0.111e-19*10^10", 75, 89},
		{TT_singleWordString, "let2", 91, 94},
		{TT_mod_mapKey, ":", 95, 95},
		{TT_empty, "empty", 97, 101},
		{TT_singleWordString, "let3", 103, 106},
		{TT_mod_mapKey, ":", 107, 107},
		{tt_nil, "nil",109,111},
		{TT_randstr, "randstr128", 113, 122},
		{TT_function, "decb64", 124, 129},
		{TT_bool, "true", 131, 134},
		{TT_bool, "off", 136, 138},
		{TT_mapListEnd, ")", 140, 140},
		{TT_comment, "# This is a comment", 142, 160},
		{TT_comment, "; Also a comment", 162, 177},
		{TT_comment, "// Also a comment", 179, 195},
		{TT_singleWordString, "/+notacomment", 197, 209},
		{TT_comment, "/* this is a ml\ncomment that\nshould stop parsing here */", 211, 266},
		{TT_mapListBegin, "(", 268, 268},
		{TT_mapListEnd, ")", 269, 269},
		{TT_eof, "", 270, 270},
	}

	l := NewTokenizer(input)
	//t.Logf("Testing input: %q", input)
	for i, tt := range tests {

		tok := l.NextToken()

		// Literal first: it is the most informative mismatch.
		if tok.Literal != tt.expectedLiteral {
			t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal)
		}

		if tok.Type != tt.expectedType {
			t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok)
		}

		// Positions are inclusive rune indices.
		if tok.Start != tt.start || tok.End != tt.end {
			t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End)
		}

		//t.Logf("Token: %s", tok.Type)
	}
}
|
||
func TestTokenizer_NextToken2(t *testing.T) { | ||
input := `/* abort comment on eof` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_comment, "/* abort comment on eof", 0, -1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} | ||
|
||
func TestTokenizer_NextToken3(t *testing.T) { | ||
input := `"abort string on eof` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_string, "abort string on eo", 0, -1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} | ||
|
||
|
||
func TestTokenizer_NextToken4(t *testing.T) { | ||
input := ` ` | ||
|
||
tests := []struct{ | ||
expectedType TokenType | ||
expectedLiteral string | ||
start,end int | ||
}{ | ||
{TT_eof, "", 1, 1}, | ||
} | ||
|
||
l := NewTokenizer(input) | ||
for i, tt := range tests { | ||
|
||
tok := l.NextToken() | ||
|
||
if tok.Literal != tt.expectedLiteral { | ||
t.Fatalf("test %d: literal wrong, expected %+#v (% x) got %+#v (% x)", i, tests[i], tests[i].expectedLiteral, tok, tok.Literal) | ||
} | ||
|
||
if tok.Type != tt.expectedType { | ||
t.Fatalf("test %d: type wrong, expected %+v got %+v", i, tests[i], tok) | ||
} | ||
|
||
if tok.Start != tt.start || tok.End != tt.end { | ||
t.Fatalf("test %d: position wrong, expected %d-%d got %d-%d", i , tests[i].start, tests[i].end, tok.Start, tok.End) | ||
} | ||
} | ||
} |
Oops, something went wrong.