Skip to content

Commit

Permalink
feat[lexer]: Support identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
watcol committed Nov 22, 2021
1 parent da1e3b0 commit 6c4164d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 5 deletions.
8 changes: 5 additions & 3 deletions docs/language.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,17 @@ mark and backslash (`U+005C`), and escape sequences starts with backslashes
can be used:

- `\n` ... linefeed (`U+000A`)
- `\r` ... cariage return (`U+000D`)
- `\r` ... carriage return (`U+000D`)
- `\t` ... horizontal tab (`U+0009`)
- `\"` ... quotation mark (`U+0022`)
- `\\` ... backslash (`U+005C`)
- `\` and linefeed (LF), carriage return (CR), or "carriage return and
linefeed" (CRLF) ... ignored
- `\xXX` ... 8 bit character (`U+00XX`)
- `\u{XXXX}` ... unicode character (`U+XXXX`)

Note that 8 bit characters are 2-digit hex values, and unicode characters are
hex values with any digits.
hex values with 2~8 digits.

```toml
string = "The \"Double quoted\"\r
Expand Down Expand Up @@ -418,7 +420,7 @@ surrounded by a pair of [curly brackets](#terms). This key can contain any key
except backslashes (`U+005C`) and right curly brackets (`U+007D`). Escape
patterns similar to [double quoted strings](#double-quoted-string) are
available, but instead of `\"`, `\}` is used to express right curly brackets
(`U+007D`). [Whitespaces](#terms) after `$` prefix are ignored.
(`U+007D`).

Note that a bare key and a raw key consisting of the same characters (for
instance `key` and `${key}`) are identical and will conflict.
Expand Down
44 changes: 42 additions & 2 deletions src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,43 @@ peg::parser! { grammar lexer() for str {
/ "_" { Symbol::UnderLine }
/ "@" { Symbol::At }

// TODO: Add test
// An identifier is either a bare identifier or a raw (`${...}`) identifier.
// The bare form is tried first; the raw form is the fallback.
rule ident() -> String
    = bare:ident_bare() { bare }
    / raw:ident_raw() { raw }
// Bare identifier: one ASCII letter followed by any number of ASCII letters,
// ASCII digits, or underscores. The matched text is returned as an owned string.
rule ident_bare() -> String
    = name:$(
        ['a'..='z' | 'A'..='Z']
        ['a'..='z' | 'A'..='Z' | '0'..='9' | '_']*
    ) { String::from(name) }
// Raw identifier: `${` ... `}`.
//
// The body is a sequence of pieces, each of which is either
//   * a single character other than a backslash or `}`, or
//   * an escape sequence, where `\}` stands for a literal `}`.
// Every piece yields `Option<char>`; pieces that yield `None` (escapes that
// produce no character) are dropped when the pieces are joined into the result.
rule ident_raw() -> String
    = "${" pieces:((
        // `$()` captures the matched slice, which holds exactly one character.
        c:$([^ '\\'|'}']) {? c.chars().next().map(Some).ok_or("char") }
        / escape("}")
    )*) "}" { pieces.into_iter().flatten().collect() }

use peg::ParseLiteral;
// Parses one escape sequence: a backslash followed by an escape body.
//
// Returns `Some(c)` for the character the escape denotes, or `None` when the
// escape produces no character (a backslash immediately followed by a line
// break).
//
// `lit` is a context-specific literal that may be escaped; when the backslash
// is followed by `lit`, the first character of `lit` is produced.
rule escape(lit: &'static str) -> Option<char> = "\\" s:(
    "n" { Some('\n') }
    / "r" { Some('\r') }
    / "t" { Some('\t') }
    / "\\" { Some('\\') }
    // Line continuations. CRLF has to be tried before the lone CR, otherwise
    // a backslash followed by CRLF would consume only the CR and leave a
    // stray LF in the surrounding text.
    / "\r\n" { None }
    / "\n" { None }
    / "\r" { None }
    / ##parse_string_literal(lit) {?
        lit.chars().next().map(Some).ok_or("literal")
    }
    // `\xXX`: exactly two hex digits, mapped to the code point of the same value.
    / "x" h:$(['0'..='9'|'a'..='f'|'A'..='F']*<2>) {?
        u8::from_str_radix(h, 16)
            .map(|byte| Some(char::from(byte)))
            .or(Err("hex"))
    }
    // `\u{X..}`: two to eight hex digits; values that are not a valid `char`
    // (surrogates, or anything above U+10FFFF) are rejected.
    / "u{" u:$(['0'..='9'|'a'..='f'|'A'..='F']*<2,8>) "}" {?
        u32::from_str_radix(u, 16)
            .or(Err("hex"))
            .and_then(|code| char::from_u32(code).map(Some).ok_or("unicode"))
    }
) { s }

// Boolean literal: `true` or `false`.
//
// The negative lookahead rejects a keyword that is immediately followed by an
// ASCII letter, digit, or underscore, so a longer word such as `trueish` or
// `false_flag` is not split into a boolean plus a trailing fragment.
rule boolean() -> bool
    = "true" !['a'..='z'|'A'..='Z'|'0'..='9'|'_'] { true }
    / "false" !['a'..='z'|'A'..='Z'|'0'..='9'|'_'] { false }

rule comment() = "#" [^ '\n'|'\r']*
Expand All @@ -44,8 +81,11 @@ peg::parser! { grammar lexer() for str {

rule token(file_id: usize) -> PosToken
= s:position!()
t:(s:symbol() { Token::Symbol(s) }
/ b:boolean() { Token::Bool(b) })
t:(
s:symbol() { Token::Symbol(s) }
/ b:boolean() { Token::Bool(b) }
/ i:ident() { Token::Ident(i) }
)
e:position!() { PosToken{ file_id, pos: s..e, token: t } }

rule statement(file_id: usize) -> Vec<PosToken>
Expand Down

0 comments on commit 6c4164d

Please sign in to comment.