From 6c4164dd36ee25a3a370efd36437c5218cc25bc7 Mon Sep 17 00:00:00 2001 From: watcol Date: Mon, 22 Nov 2021 23:04:28 +0900 Subject: [PATCH] feat[lexer]: Support identifiers --- docs/language.md | 8 +++++--- src/lex.rs | 44 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/docs/language.md b/docs/language.md index 1273a6b..11fada1 100644 --- a/docs/language.md +++ b/docs/language.md @@ -181,15 +181,17 @@ mark and backslash (`U+005C`), and escape sequences starts with backslashes can be used: - `\n` ... linefeed (`U+000A`) -- `\r` ... cariage return (`U+000D`) +- `\r` ... carriage return (`U+000D`) - `\t` ... horizontal tab (`U+0009`) - `\"` ... quatation mark (`U+0022`) - `\\` ... backslash (`U+005C`) +- `\` and linefeed (LF), carriage return (CR), or "carriage return and + linefeed" (CRLF) ... ignored - `\xXX` ... 8 bit character (`U+00XX`) - `\u{XXXX}` ... unicode character (`U+XXXX`) Note that 8 bit characters are 2-digit hex values, and unicode characters are -hex values with any digits. +hex values with 2~8 digits. ```toml string = "The \"Double quated\"\r @@ -418,7 +420,7 @@ surrounded by a pair of [curly brackets](#terms). This key can contain any key except backslashes (`U+005C`) and right curly brackets (`U+007D`). Escape patterns similar to [double quoted strings](#double-quoted-string) is available, but instead of `\"`, `\}` is used to express right curly brackets -(`U+007D`). [Whitespaces](#terms) after `$` prefix are ignored. +(`U+007D`). Note that bare keys and raw keys consists of same characters (for instance `key` and `${key}`) are identical and will conflict. diff --git a/src/lex.rs b/src/lex.rs index bf85db0..8f1082a 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -35,6 +35,43 @@ peg::parser! 
{ grammar lexer() for str { / "_" { Symbol::UnderLine } / "@" { Symbol::At } + // TODO: Add test + rule ident() -> String = ident_bare() / ident_raw() + rule ident_bare() -> String + = s:$(['a'..='z'|'A'..='Z'] ['a'..='z'|'A'..='Z'|'0'..='9'|'_']*) { s.to_string() } + rule ident_raw() -> String + = "${" s:(( + c:$([^ '\\'|'}']) {? c.chars().next().map(|c| Some(c)).ok_or("char") } + / escape("}") + )*) "}" { s.into_iter().flat_map(|x| x).collect() } + + use peg::ParseLiteral; + rule escape(lit: &'static str) -> Option<char> = "\\" s:( + "n" { Some('\n') } + / "r" { Some('\r') } + / "t" { Some('\t') } + / "\\" { Some('\\') } + / "\r\n" { None } + / "\n" { None } + / "\r" { None } + / ##parse_string_literal(lit) {? + lit.chars() + .next() + .map(|c| Some(c)) + .ok_or("literal") + } + / "x" h:$(['0'..='9'|'a'..='f'|'A'..='F']*<2>) {? + u8::from_str_radix(h, 16).map(|u| Some(u as char)).or(Err("hex")) + } + / "u{" u:$(['0'..='9'|'a'..='f'|'A'..='F']*<2,8>) "}" {? + u32::from_str_radix(u, 16) + .or(Err("hex")) + .and_then(|u| { + u.try_into().map(|u| Some(u)).or(Err("unicode")) + }) + } + ) { s } + rule boolean() -> bool = "true" { true } / "false" { false } rule comment() = "#" [^ '\n'|'\r']* @@ -44,8 +81,11 @@ peg::parser! { grammar lexer() for str { rule token(file_id: usize) -> PosToken = s:position!() - t:(s:symbol() { Token::Symbol(s) } - / b:boolean() { Token::Bool(b) }) + t:( + s:symbol() { Token::Symbol(s) } + / b:boolean() { Token::Bool(b) } + / i:ident() { Token::Ident(i) } + ) e:position!() { PosToken{ file_id, pos: s..e, token: t } } rule statement(file_id: usize) -> Vec