feat[lexer]: Support identifiers

watcol · Nov 22, 2021 · 6c4164d · 6c4164d
1 parent da1e3b0
commit 6c4164d
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 5 deletions.
diff --git a/docs/language.md b/docs/language.md
@@ -181,15 +181,17 @@ mark and backslash (`U+005C`), and escape sequences starts with backslashes
 can be used:
 
 - `\n` ... linefeed (`U+000A`)
-- `\r` ... cariage return (`U+000D`)
+- `\r` ... carriage return (`U+000D`)
 - `\t` ... horizontal tab (`U+0009`)
 - `\"` ... quotation mark (`U+0022`)
 - `\\` ... backslash (`U+005C`)
+- `\` and linefeed (LF), carriage return (CR), or "carriage return and
+ linefeed" (CRLF) ... ignored
 - `\xXX` ... 8 bit character (`U+00XX`)
 - `\u{XXXX}` ... unicode character (`U+XXXX`)
 
 Note that 8 bit characters are 2-digit hex values, and unicode characters are
-hex values with any digits.
+hex values with 2~8 digits.
 
 ```toml
 string = "The \"Double quated\"\r
@@ -418,7 +420,7 @@ surrounded by a pair of [curly brackets](#terms). This key can contain any key
 except backslashes (`U+005C`) and right curly brackets (`U+007D`). Escape
 patterns similar to [double quoted strings](#double-quoted-string) are
 available, but instead of `\"`, `\}` is used to express right curly brackets
-(`U+007D`). [Whitespaces](#terms) after `$` prefix are ignored.
+(`U+007D`).
 
 Note that bare keys and raw keys consisting of the same characters (for
 instance `key` and `${key}`) are identical and will conflict.

diff --git a/src/lex.rs b/src/lex.rs
@@ -35,6 +35,43 @@ peg::parser! { grammar lexer() for str {
  / "_" { Symbol::UnderLine }
  / "@" { Symbol::At }
 
+ // TODO: Add test
+ // An identifier is either a bare word or a raw `${...}` key; the bare
+ // form is attempted first.
+ rule ident() -> String
+   = ident_bare()
+   / ident_raw()
+ // Bare identifier: one ASCII letter followed by any run of ASCII letters,
+ // ASCII digits or underscores.
+ rule ident_bare() -> String
+   = word:$(['a'..='z'|'A'..='Z'] ['a'..='z'|'A'..='Z'|'0'..='9'|'_']*) { word.to_string() }
+ // Raw identifier: `${`, then any mix of plain characters (anything except
+ // a backslash or `}`) and escape sequences, then a closing `}`. Escapes
+ // that decode to nothing (`None`) are dropped from the resulting string.
+ rule ident_raw() -> String
+   = "${" pieces:((
+       ch:[^ '\\'|'}'] { Some(ch) }
+     / escape("}")
+   )*) "}" { pieces.into_iter().flatten().collect() }
+
+ use peg::ParseLiteral;
+ // Parses a single escape sequence: a backslash followed by a selector.
+ // Yields `Some(char)` for the decoded character, or `None` when the
+ // backslash escapes a line break (the line break is swallowed and
+ // contributes nothing to the result).
+ //
+ // `lit` is the context-specific delimiter that may be escaped (for example
+ // "}" inside a raw identifier); only its first character is produced.
+ rule escape(lit: &'static str) -> Option<char> = "\\" s:(
+     "n" { Some('\n') }
+   / "r" { Some('\r') }
+   / "t" { Some('\t') }
+   / "\\" { Some('\\') }
+   // PEG choice is ordered, so CRLF has to be tried before the lone CR
+   // alternative; otherwise only the CR is consumed and the LF that
+   // follows is left in the input as an ordinary character.
+   / "\r\n" { None }
+   / "\n" { None }
+   / "\r" { None }
+   / ##parse_string_literal(lit) {?
+       lit.chars().next().map(Some).ok_or("literal")
+     }
+   // `\xXX`: exactly two hex digits, mapped to U+00XX.
+   / "x" h:$(['0'..='9'|'a'..='f'|'A'..='F']*<2>) {?
+       u8::from_str_radix(h, 16)
+           .map(|byte| Some(char::from(byte)))
+           .or(Err("hex"))
+     }
+   // `\u{...}`: two to eight hex digits; values that are not valid Unicode
+   // scalar values are rejected with "unicode".
+   / "u{" u:$(['0'..='9'|'a'..='f'|'A'..='F']*<2,8>) "}" {?
+       u32::from_str_radix(u, 16)
+           .or(Err("hex"))
+           .and_then(|code| code.try_into().map(Some).or(Err("unicode")))
+     }
+ ) { s }
+
  rule boolean() -> bool = "true" { true } / "false" { false }
 
  rule comment() = "#" [^ '\n'|'\r']*
@@ -44,8 +81,11 @@ peg::parser! { grammar lexer() for str {
 
  rule token(file_id: usize) -> PosToken
  = s:position!()
- t:(s:symbol() { Token::Symbol(s) }
- / b:boolean() { Token::Bool(b) })
+ t:(
+ s:symbol() { Token::Symbol(s) }
+ / b:boolean() { Token::Bool(b) }
+ / i:ident() { Token::Ident(i) }
+ )
  e:position!() { PosToken{ file_id, pos: s..e, token: t } }
 
  rule statement(file_id: usize) -> Vec<PosToken>