Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string encoding suffixes #127

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Implement string encodings
  • Loading branch information
Artentus committed Oct 26, 2021
commit dc014c857daa7e587b8920a223cc8d765dc2a656
2 changes: 1 addition & 1 deletion src/asm/parser/include.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pub fn parse_directive_include(
state: &mut asm::parser::State)
-> Result<(), ()>
{
let tk_filename = state.parser.expect(syntax::TokenKind::String)?;
let tk_filename = state.parser.expect(syntax::TokenKind::String(expr::StringEncoding::Utf8))?;
let filename = syntax::excerpt_as_string_contents(
state.report.clone(),
tk_filename.excerpt.as_ref().unwrap(),
Expand Down
34 changes: 32 additions & 2 deletions src/expr/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,23 @@ pub enum Value
}


/// Byte encoding applied to a string's contents when the string is
/// converted into an integer value. Selected by an optional one-letter
/// suffix placed right after the closing quote of a string literal.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum StringEncoding
{
/// UTF-8 bytes; used when the literal carries no suffix.
Utf8,
/// UTF-16 code units, each written big-endian (`W` suffix).
Utf16BE,
/// UTF-16 code units, each written little-endian (`w` suffix).
Utf16LE,
/// One 32-bit code point per character, big-endian (`U` suffix).
UnicodeBE,
/// One 32-bit code point per character, little-endian (`u` suffix).
UnicodeLE,
/// One byte per character, taken from the low 8 bits of its code point (`a` suffix).
Ascii,
}


/// A string value: its text plus the encoding to use when the text is
/// turned into bytes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ValueString
{
/// The text of the string, stored as a Rust (UTF-8) `String`.
pub utf8_contents: String,
pub encoding: String,
/// Encoding applied when the string is converted to an integer.
pub encoding: StringEncoding,
}


Expand Down Expand Up @@ -105,6 +117,15 @@ impl Value
}


/// Builds a `Value::String` that owns a copy of `s` and is tagged with
/// the given `encoding`.
pub fn make_string(s: &str, encoding: expr::StringEncoding) -> Value
{
    let contents = ValueString
    {
        utf8_contents: String::from(s),
        encoding,
    };

    Value::String(contents)
}


pub fn get_bigint(&self) -> Option<util::BigInt>
{
match self
Expand All @@ -122,6 +143,15 @@ impl ValueString
{
/// Converts the string into a `util::BigInt` built from the bytes of its
/// contents, serialized according to `self.encoding`.
pub fn to_bigint(&self) -> util::BigInt
{
util::BigInt::new_from_str(&self.utf8_contents)
// Serialize the text into raw bytes using the encoding chosen for this string.
// NOTE(review): the Ascii arm keeps only the low 8 bits of each code point,
// so non-ASCII characters are silently truncated — confirm this is intended.
let bytes: Vec<u8> = match self.encoding
{
StringEncoding::Utf8 => self.utf8_contents.bytes().collect(),
StringEncoding::Utf16BE => self.utf8_contents.encode_utf16().flat_map(|v| v.to_be_bytes()).collect(),
StringEncoding::Utf16LE => self.utf8_contents.encode_utf16().flat_map(|v| v.to_le_bytes()).collect(),
StringEncoding::UnicodeBE => self.utf8_contents.chars().flat_map(|c| (c as u32).to_be_bytes()).collect(),
StringEncoding::UnicodeLE => self.utf8_contents.chars().flat_map(|c| (c as u32).to_le_bytes()).collect(),
StringEncoding::Ascii => self.utf8_contents.chars().map(|c| (c as u32) as u8).collect(), // can potentially contain invalid chars
};
// NOTE(review): `new_from_str` recorded `size` as `bytes.len() * 8`;
// presumably `from_bytes_be` does the same — verify.
util::BigInt::from_bytes_be(&bytes)
}
}
1 change: 1 addition & 0 deletions src/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod eval;

pub use self::expression::Expr;
pub use self::expression::Value;
pub use self::expression::StringEncoding;
pub use self::expression::ValueString;
pub use self::expression::UnaryOp;
pub use self::expression::BinaryOp;
Expand Down
10 changes: 5 additions & 5 deletions src/expr/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,8 @@ impl<'a, 'parser> ExpressionParser<'a, 'parser>
else if self.parser.next_is(0, syntax::TokenKind::Number)
{ self.parse_number() }

else if self.parser.next_is(0, syntax::TokenKind::String)
{ self.parse_string() }
else if let Some(encoding) = self.parser.next_is_string(0)
{ self.parse_string(encoding) }

else if self.parser.next_is(0, syntax::TokenKind::KeywordAsm)
{ self.parse_asm() }
Expand Down Expand Up @@ -497,9 +497,9 @@ impl<'a, 'parser> ExpressionParser<'a, 'parser>
}


fn parse_string(&mut self) -> Result<expr::Expr, ()>
fn parse_string(&mut self, encoding: expr::StringEncoding) -> Result<expr::Expr, ()>
{
let tk_str = self.parser.expect(syntax::TokenKind::String)?;
let tk_str = self.parser.expect(syntax::TokenKind::String(encoding))?;

let string = syntax::excerpt_as_string_contents(
self.parser.report.clone().unwrap_or(diagn::RcReport::new()),
Expand All @@ -511,7 +511,7 @@ impl<'a, 'parser> ExpressionParser<'a, 'parser>
expr::Value::String(expr::ValueString
{
utf8_contents: string,
encoding: "utf8".to_string(),
encoding,
}));

Ok(expr)
Expand Down
21 changes: 21 additions & 0 deletions src/syntax/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,27 @@ impl<'a> Parser<'a>

self.tokens[index].kind == kind
}


/// Looks ahead `nth` tokens from the current position and, if the token
/// found there is a string, returns the encoding it carries.
///
/// After each step forward, any ignorable tokens are skipped. Returns
/// `None` when the lookahead runs past the end of the token list or the
/// token reached is not a string.
pub fn next_is_string(&self, nth: usize) -> Option<expr::StringEncoding>
{
    let token_count = self.tokens.len();
    let mut cursor = self.index;

    for _ in 0..nth
    {
        // Stop stepping once the cursor has left the token list.
        if cursor >= token_count
            { break; }

        cursor += 1;
        while cursor < token_count && self.tokens[cursor].kind.ignorable()
            { cursor += 1; }
    }

    self.tokens
        .get(cursor)
        .and_then(|token| token.kind.is_string())
}


pub fn maybe_expect(&mut self, kind: TokenKind) -> Option<Token>
Expand Down
56 changes: 49 additions & 7 deletions src/syntax/token.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::diagn::{Span, RcReport};
use crate::expr::StringEncoding;
use std::rc::Rc;


Expand All @@ -20,7 +21,7 @@ pub enum TokenKind
LineBreak,
Identifier,
Number,
String,
String(StringEncoding),
KeywordAsm,
ParenOpen,
ParenClose,
Expand Down Expand Up @@ -66,11 +67,19 @@ pub enum TokenKind

impl TokenKind
{
/// Returns the encoding carried by a `TokenKind::String`, or `None` for
/// every other token kind.
pub fn is_string(self) -> Option<StringEncoding>
{
    match self
    {
        TokenKind::String(encoding) => Some(encoding),
        _ => None,
    }
}

/// Returns whether this token kind is one of the variable kinds —
/// identifier, number, or string (with any encoding).
fn needs_excerpt(self) -> bool
{
self == TokenKind::Identifier ||
self == TokenKind::Number ||
self == TokenKind::String
self.is_string().is_some()
}


Expand Down Expand Up @@ -130,7 +139,7 @@ impl TokenKind
TokenKind::LineBreak => "line break",
TokenKind::Identifier => "identifier",
TokenKind::Number => "number",
TokenKind::String => "string",
TokenKind::String(_) => "string",
TokenKind::KeywordAsm => "`asm` keyword",
TokenKind::ParenOpen => "`(`",
TokenKind::ParenClose => "`)`",
Expand Down Expand Up @@ -257,12 +266,22 @@ where S: Into<String>
check_for_string (&src[index..]).unwrap_or_else(||
(TokenKind::Error, 1)))))));

let span = Span::new(filename.clone(), index, index + length);
let length_offset = if let Some(encoding) = kind.is_string() {
if encoding != StringEncoding::Utf8 {
1
} else {
0
}
} else {
0
};

let span = Span::new(filename.clone(), index, index + length - length_offset);

// Get the source excerpt for variable tokens (e.g. identifiers).
let excerpt = match kind.needs_excerpt()
{
true => Some(src[index..].iter().cloned().take(length).collect()),
true => Some(src[index..].iter().cloned().take(length - length_offset).collect()),
false => None
};

Expand Down Expand Up @@ -401,10 +420,33 @@ fn check_for_string(src: &[char]) -> Option<(TokenKind, usize)>

if src[length] != '\"'
{ return None; }

length += 1;

let encoding = if length >= src.len() {
StringEncoding::Utf8
} else if src[length] == 'W' {
length += 1;
StringEncoding::Utf16BE
} else if src[length] == 'U' {
length += 1;
StringEncoding::UnicodeBE
} else if src[length] == 'w' {
length += 1;
StringEncoding::Utf16LE
} else if src[length] == 'u' {
length += 1;
StringEncoding::UnicodeLE
} else if src[length] == 'a' {
length += 1;
StringEncoding::Ascii
} else if src[length].is_alphanumeric() {
return None
} else {
StringEncoding::Utf8
};

Some((TokenKind::String, length))
Some((TokenKind::String(encoding), length))
}


Expand Down
21 changes: 21 additions & 0 deletions src/test/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,27 @@ fn test_literals()
}


// Checks that string literals evaluate to string values carrying the
// encoding selected by their suffix, and that an unknown suffix fails.
#[test]
fn test_string_literals()
{
// Empty literal: no suffix means UTF-8; W/w/U/u/a select the other encodings.
test("\"\"", Pass(expr::Value::make_string("", expr::StringEncoding::Utf8)));
test("\"\"W", Pass(expr::Value::make_string("", expr::StringEncoding::Utf16BE)));
test("\"\"w", Pass(expr::Value::make_string("", expr::StringEncoding::Utf16LE)));
test("\"\"U", Pass(expr::Value::make_string("", expr::StringEncoding::UnicodeBE)));
test("\"\"u", Pass(expr::Value::make_string("", expr::StringEncoding::UnicodeLE)));
test("\"\"a", Pass(expr::Value::make_string("", expr::StringEncoding::Ascii)));
// A suffix letter that is not a known encoding is expected to fail.
test("\"\"x", Fail(("test", 1, "unexpected character")));

// Same checks with a non-empty literal.
test("\"abc\"", Pass(expr::Value::make_string("abc", expr::StringEncoding::Utf8)));
test("\"abc\"W", Pass(expr::Value::make_string("abc", expr::StringEncoding::Utf16BE)));
test("\"abc\"w", Pass(expr::Value::make_string("abc", expr::StringEncoding::Utf16LE)));
test("\"abc\"U", Pass(expr::Value::make_string("abc", expr::StringEncoding::UnicodeBE)));
test("\"abc\"u", Pass(expr::Value::make_string("abc", expr::StringEncoding::UnicodeLE)));
test("\"abc\"a", Pass(expr::Value::make_string("abc", expr::StringEncoding::Ascii)));
test("\"abc\"x", Fail(("test", 1, "unexpected character")));
}


#[test]
fn test_variables()
{
Expand Down
12 changes: 0 additions & 12 deletions src/util/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,6 @@ impl BigInt
}


/// Creates a `BigInt` from the UTF-8 bytes of `s`, read as a big-endian
/// signed integer. `size` is set to the number of bytes times eight.
pub fn new_from_str(s: &str) -> BigInt
{
    let raw = s.as_bytes();

    BigInt
    {
        bigint: num_bigint::BigInt::from_signed_bytes_be(raw),
        size: Some(raw.len() * 8),
    }
}


pub fn as_string(&self) -> String
{
String::from_utf8_lossy(&self.bigint.to_signed_bytes_be()).to_string()
Expand Down