Skip to content

Commit

Permalink
Optionally support Python's WTF-8 string literals.
Browse files Browse the repository at this point in the history
eg. '-\u5171\u0141\u2661\u0363\uDC80'
  • Loading branch information
progval committed Jun 16, 2018
1 parent 7c0354b commit cd6159b
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 66 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ script:
- cargo test --no-default-features --features "$FEATURES"
env:
matrix:
- FEATURES="bigint"
- FEATURES="bigint wtf8"
- FEATURES=""
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors = ["Valentin Lorentz <[email protected]>"]
license = "GPL-3.0-or-later"

[features]
default = ["bigint"]
default = ["bigint", "wtf8"]
bigint = ["num-traits", "num-bigint"]

[[bin]]
Expand All @@ -20,6 +20,7 @@ unicode-xid = "^0.1"
#unicode_names = "^0.1.7"
num-traits = { version="^0.2.4", optional=true }
num-bigint = { version="^0.2.0", optional=true }
wtf8 = { version="^0.0.3", optional=true }

[dev-dependencies]
pretty_assertions = "^0.4"
16 changes: 15 additions & 1 deletion src/ast.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
use std::fmt;

#[cfg(feature="bigint")]
use num_bigint::BigUint;

#[cfg(feature="wtf8")]
use wtf8;

/// Integer type exposed by this module when the `bigint` feature is enabled:
/// an alias for `num_bigint::BigUint`.
#[cfg(feature="bigint")]
pub type IntegerType = BigUint;
/// Integer type exposed by this module when the `bigint` feature is disabled:
/// a plain `u64`.
// NOTE(review): presumably values wider than 64 bits cannot be represented in
// this configuration — confirm how the number parser handles them.
#[cfg(not(feature="bigint"))]
pub type IntegerType = u64;

/// Storage type for the decoded content of a Python string literal when the
/// `wtf8` feature is enabled: an alias for `wtf8::Wtf8Buf`.
#[cfg(feature="wtf8")]
pub type PyStringContent = wtf8::Wtf8Buf;
/// Unit pushed into a `PyStringContent` when the `wtf8` feature is enabled:
/// an alias for `wtf8::CodePoint`.
#[cfg(feature="wtf8")]
pub type PyStringCodePoint = wtf8::CodePoint;

/// Storage type for the decoded content of a Python string literal when the
/// `wtf8` feature is disabled: a standard `String`.
#[cfg(not(feature="wtf8"))]
pub type PyStringContent = String;
/// Unit pushed into a `PyStringContent` when the `wtf8` feature is disabled:
/// a standard `char`.
#[cfg(not(feature="wtf8"))]
pub type PyStringCodePoint = char;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ArgumentError {
KeywordExpression,
Expand Down Expand Up @@ -206,7 +220,7 @@ pub enum SetItem {
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PyString {
pub prefix: String,
pub content: String,
pub content: PyStringContent,
}

#[derive(Clone, Debug, PartialEq)]
Expand Down
30 changes: 19 additions & 11 deletions src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,43 +592,51 @@ named!(pub yield_expr<StrSpan, Expression>,
mod tests {
use helpers::{NewlinesAreNotSpaces, make_strspan, assert_parse_eq};
use super::*;

/// Test helper (`wtf8` build): assembles the `PyString` a parser test expects,
/// from a literal prefix and the already-decoded content `s`.
#[cfg(feature="wtf8")]
fn new_pystring(prefix: &str, s: &str) -> PyString {
    let prefix = String::from(prefix);
    let content = PyStringContent::from_str(s);
    PyString { prefix, content }
}

/// Test helper (non-`wtf8` build): assembles the `PyString` a parser test
/// expects, from a literal prefix and the already-decoded content `s`.
#[cfg(not(feature="wtf8"))]
fn new_pystring(prefix: &str, s: &str) -> PyString {
    let prefix = String::from(prefix);
    let content = String::from(s);
    PyString { prefix, content }
}

#[test]
fn test_string() {
let atom = ExpressionParser::<NewlinesAreNotSpaces>::atom;
let new_pystring = |s: &str| PyString { prefix: "".to_string(), content: s.to_string() };
assert_parse_eq(atom(make_strspan(r#""foo" "#)), Ok((make_strspan(" "),
Box::new(Expression::String(vec![new_pystring("foo")])))
Box::new(Expression::String(vec![new_pystring("", "foo")])))
));
assert_parse_eq(atom(make_strspan(r#""foo" "bar""#)), Ok((make_strspan(""),
Box::new(Expression::String(vec![new_pystring("foo"), new_pystring("bar")])))
Box::new(Expression::String(vec![new_pystring("", "foo"), new_pystring("", "bar")])))
));
assert_parse_eq(atom(make_strspan(r#""fo\"o" "#)), Ok((make_strspan(" "),
Box::new(Expression::String(vec![new_pystring("fo\"o")])))
Box::new(Expression::String(vec![new_pystring("", "fo\"o")])))
));
assert_parse_eq(atom(make_strspan(r#""fo"o" "#)), Ok((make_strspan(r#"o" "#),
Box::new(Expression::String(vec![new_pystring("fo")])))
Box::new(Expression::String(vec![new_pystring("", "fo")])))
));
assert_parse_eq(atom(make_strspan(r#""fo \" o" "#)), Ok((make_strspan(" "),
Box::new(Expression::String(vec![new_pystring("fo \" o")])))
Box::new(Expression::String(vec![new_pystring("", "fo \" o")])))
));
assert_parse_eq(atom(make_strspan(r#"'fo \' o' "#)), Ok((make_strspan(" "),
Box::new(Expression::String(vec![new_pystring("fo ' o")])))
Box::new(Expression::String(vec![new_pystring("", "fo ' o")])))
));
assert_parse_eq(atom(make_strspan(r#"r'fo \' o' "#)), Ok((make_strspan(" "),
Box::new(Expression::String(vec![PyString { prefix: "r".to_string(), content: "fo \\' o".to_string() }])))
Box::new(Expression::String(vec![new_pystring("r", "fo \\' o")])))
));

assert_parse_eq(atom(make_strspan(r#"'\x8a'"#)), Ok((make_strspan(""),
Box::new(Expression::String(vec![new_pystring("\u{8a}")])))
Box::new(Expression::String(vec![new_pystring("", "\u{8a}")])))
));
}

#[test]
fn test_triple_quotes_string() {
let new_pystring = |s: &str| PyString { prefix: "".to_string(), content: s.to_string() };
let atom = ExpressionParser::<NewlinesAreNotSpaces>::atom;
assert_parse_eq(atom(make_strspan(r#"'''fo ' o''' "#)), Ok((make_strspan(" "), Box::new(Expression::String(vec![new_pystring("fo ' o")])))));
assert_parse_eq(atom(make_strspan(r#"'''fo ' o''' "#)), Ok((make_strspan(" "), Box::new(Expression::String(vec![new_pystring("", "fo ' o")])))));
}

#[test]
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ extern crate num_traits;
#[cfg(feature="bigint")]
extern crate num_bigint;

#[cfg(feature="wtf8")]
extern crate wtf8;

#[macro_use]
mod helpers;
#[macro_use]
Expand Down
92 changes: 56 additions & 36 deletions src/strings.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,53 @@
use nom::anychar;

#[cfg(feature="wtf8")]
use wtf8;

use helpers::StrSpan;
use ast::*;

named!(escapedchar<StrSpan, Option<char>>,
/// `wtf8` build: converts a Rust `char` into a `wtf8::CodePoint` by delegating
/// to `wtf8::CodePoint::from_char`.
///
/// Paired with a non-`wtf8` function of the same name so the parsers can call
/// `cp_from_char` regardless of which feature set is compiled.
#[cfg(feature="wtf8")]
fn cp_from_char(c: char) -> wtf8::CodePoint {
wtf8::CodePoint::from_char(c)
}
/// `wtf8` build: converts a numeric escape value into a `wtf8::CodePoint` by
/// delegating to `wtf8::CodePoint::from_u32`; `None` is returned whenever that
/// call rejects `n`.
// NOTE(review): unlike `std::char::from_u32`, this is expected to accept lone
// surrogates (0xD800-0xDFFF) and reject only values above 0x10FFFF — confirm
// against the wtf8 crate documentation.
#[cfg(feature="wtf8")]
fn cp_from_u32(n: u32) -> Option<wtf8::CodePoint> {
wtf8::CodePoint::from_u32(n)
}
/// Non-`wtf8` build: identity conversion. The code point type is `char` here,
/// so the input is returned unchanged.
///
/// Paired with a `wtf8` function of the same name so the parsers can call
/// `cp_from_char` regardless of which feature set is compiled.
#[cfg(not(feature="wtf8"))]
fn cp_from_char(c: char) -> char {
c
}
/// Non-`wtf8` build: converts a numeric escape value into a `char` by
/// delegating to `std::char::from_u32`.
///
/// Returns `None` when `n` is not a valid `char`, i.e. a surrogate
/// (0xD800-0xDFFF) or a value above 0x10FFFF.
#[cfg(not(feature="wtf8"))]
fn cp_from_u32(n: u32) -> Option<char> {
::std::char::from_u32(n)
}

named!(escapedchar<StrSpan, Option<PyStringCodePoint>>,
preceded!(char!('\\'),
alt!(
char!('\n') => { |_| None }
| char!('\\') => { |_| Some('\\') }
| char!('\'') => { |_| Some('\'') }
| char!('"') => { |_| Some('"') }
| char!('a') => { |_| Some('\x07') } // BEL
| char!('b') => { |_| Some('\x08') } // BS
| char!('f') => { |_| Some('\x0c') } // FF
| char!('n') => { |_| Some('\n') }
| char!('r') => { |_| Some('\r') }
| char!('t') => { |_| Some('\t') }
| char!('v') => { |_| Some('\x0b') } // VT
| char!('\\') => { |_| Some(cp_from_char('\\')) }
| char!('\'') => { |_| Some(cp_from_char('\'')) }
| char!('"') => { |_| Some(cp_from_char('"')) }
| char!('a') => { |_| Some(cp_from_char('\x07')) } // BEL
| char!('b') => { |_| Some(cp_from_char('\x08')) } // BS
| char!('f') => { |_| Some(cp_from_char('\x0c')) } // FF
| char!('n') => { |_| Some(cp_from_char('\n')) }
| char!('r') => { |_| Some(cp_from_char('\r')) }
| char!('t') => { |_| Some(cp_from_char('\t')) }
| char!('v') => { |_| Some(cp_from_char('\x0b')) } // VT
| tuple!(one_of!("01234567"), opt!(one_of!("01234567")), opt!(one_of!("01234567"))) => { |(c1, c2, c3): (char, Option<char>, Option<char>)|
match (c1.to_digit(8), c2.and_then(|c| c.to_digit(8)), c3.and_then(|c| c.to_digit(8))) {
(Some(d1), Some(d2), Some(d3)) => ::std::char::from_u32((d1 << 6) + (d2 << 3) + d3),
(Some(d1), Some(d2), None ) => ::std::char::from_u32((d1 << 3) + d2),
(Some(d1), None, None ) => ::std::char::from_u32(d1),
(Some(d1), Some(d2), Some(d3)) => cp_from_u32((d1 << 6) + (d2 << 3) + d3),
(Some(d1), Some(d2), None ) => cp_from_u32((d1 << 3) + d2),
(Some(d1), None, None ) => cp_from_u32(d1),
_ => unreachable!(),
}
}
| preceded!(char!('x'), tuple!(one_of!("0123456789abcdefABCDEF"), one_of!("0123456789abcdefABCDEF"))) => { |(c1, c2): (char, char)|
match (c1.to_digit(16), c2.to_digit(16)) {
(Some(d1), Some(d2)) => ::std::char::from_u32((d1 << 4) + d2),
(Some(d1), Some(d2)) => cp_from_u32((d1 << 4) + d2),
_ => unreachable!(),
}
}
Expand All @@ -38,14 +58,14 @@ named!(escapedchar<StrSpan, Option<char>>,
| preceded!(char!('u'), count!(one_of!("0123456789abcdefABCDEF"), 4)) => { |v: Vec<char>| {
let it: Vec<u32> = v.iter().map(|c| c.to_digit(16).unwrap()).collect();
if let [d1, d2, d3, d4] = &it[..] {
::std::char::from_u32((d1 << 12) + (d2 << 8) + (d3 << 4) + d4)
cp_from_u32((d1 << 12) + (d2 << 8) + (d3 << 4) + d4)
}
else { unreachable!() }
}}
| preceded!(char!('U'), count!(one_of!("0123456789abcdefABCDEF"), 8)) => { |v: Vec<char>| {
let it: Vec<u32> = v.iter().map(|c| c.to_digit(16).unwrap()).collect();
if let [d1, d2, d3, d4, d5, d6, d7, d8] = &it[..] {
::std::char::from_u32((d1 << 28) + (d2 << 24) + (d3 << 20) + (d4 << 16) +
cp_from_u32((d1 << 28) + (d2 << 24) + (d3 << 20) + (d4 << 16) +
(d5 << 12) + (d6 << 8) + (d7 << 4) + d8)
}
else { unreachable!() }
Expand All @@ -54,51 +74,51 @@ named!(escapedchar<StrSpan, Option<char>>,
)
);

named_args!(shortstring(quote: char) <StrSpan, String>,
named_args!(shortstring(quote: char) <StrSpan, PyStringContent>,
fold_many0!(
alt!(
call!(escapedchar)
| verify!(anychar, |c:char| c != quote) => { |c:char| Some(c) }
| verify!(anychar, |c:char| c != quote) => { |c:char| Some(cp_from_char(c)) }
),
String::new(),
|mut acc:String, c:Option<char>| { match c { Some(c) => acc.push_str(&c.to_string()), None => () }; acc }
PyStringContent::new(),
|mut acc:PyStringContent, c:Option<PyStringCodePoint>| { match c { Some(c) => acc.push(c), None => () }; acc }
)
);

named_args!(longstring(quote: char) <StrSpan, String>,
named_args!(longstring(quote: char) <StrSpan, PyStringContent>,
fold_many0!(
alt!(
call!(escapedchar)
| verify!(tuple!(peek!(take!(3)), anychar), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| Some(c) }
| verify!(tuple!(peek!(take!(3)), anychar), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| Some(cp_from_char(c)) }
),
String::new(),
|mut acc:String, c:Option<char>| { match c { Some(c) => acc.push_str(&c.to_string()), None => () }; acc }
PyStringContent::new(),
|mut acc:PyStringContent, c:Option<PyStringCodePoint>| { match c { Some(c) => acc.push(c), None => () }; acc }
)
);

named_args!(shortrawstring(quote: char) <StrSpan, String>,
named_args!(shortrawstring(quote: char) <StrSpan, PyStringContent>,
fold_many0!(
alt!(
tuple!(char!('\\'), anychar) => { |(c1,c2)| (c1, Some(c2)) }
| verify!(none_of!("\\"), |c:char| c != quote) => { |c:char| (c, None) }
tuple!(char!('\\'), anychar) => { |(c1,c2)| (cp_from_char(c1), Some(cp_from_char(c2))) }
| verify!(none_of!("\\"), |c:char| c != quote) => { |c:char| (cp_from_char(c), None) }
),
String::new(),
|mut acc:String, (c1,c2):(char, Option<char>)| {
PyStringContent::new(),
|mut acc:PyStringContent, (c1,c2):(PyStringCodePoint, Option<PyStringCodePoint>)| {
acc.push(c1);
match c2 { Some(c) => acc.push(c), None => () };
acc
}
)
);

named_args!(longrawstring(quote: char) <StrSpan, String>,
named_args!(longrawstring(quote: char) <StrSpan, PyStringContent>,
fold_many0!(
alt!(
tuple!(char!('\\'), anychar) => { |(c1,c2)| (c1, Some(c2)) }
| verify!(tuple!(peek!(take!(3)), none_of!("\\")), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| (c, None) }
tuple!(char!('\\'), anychar) => { |(c1,c2)| (cp_from_char(c1), Some(cp_from_char(c2))) }
| verify!(tuple!(peek!(take!(3)), none_of!("\\")), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| (cp_from_char(c), None) }
),
String::new(),
|mut acc:String, (c1,c2):(char, Option<char>)| {
PyStringContent::new(),
|mut acc:PyStringContent, (c1,c2):(PyStringCodePoint, Option<PyStringCodePoint>)| {
acc.push(c1);
match c2 { Some(c) => acc.push(c), None => () };
acc
Expand All @@ -123,7 +143,7 @@ named!(pub string<StrSpan, PyString>,
| delimited!(char!('\''), call!(shortrawstring, '\''), char!('\''))
| delimited!(char!('"'), call!(shortrawstring, '"'), char!('"'))
)
) >> (PyString { prefix: prefix.to_string(), content: content.to_string() })
) >> (PyString { prefix: prefix.to_string(), content: content })
)
);

54 changes: 38 additions & 16 deletions src/visitors/printer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,43 @@ fn format_float(n: f64) -> String {
s
}

/// Renders a run of adjacent Python string literals as source text (`wtf8` build).
///
/// Each literal is emitted as `<prefix>"<escaped content>"` and the pieces are
/// combined with `space_join`. The prefix is lowercased and every `r` is
/// removed from it — presumably because the body is always written in escaped
/// form, so keeping a raw prefix would change its meaning.
///
/// Escaping rules, applied per code point:
/// * CR, LF, TAB, backslash and double quote use their short escapes;
/// * other printable ASCII (0x20-0x7e) is copied verbatim;
/// * remaining values up to 0xff become `\xNN`;
/// * values up to 0xffff (lone surrogates included) become `\uNNNN`;
/// * everything above becomes `\UNNNNNNNN`.
#[cfg(feature="wtf8")]
fn format_string(v: &[PyString]) -> String {
    space_join(v.iter().map(|PyString { prefix, content }| {
        // Accumulate into one buffer instead of allocating a `String` per code point.
        let mut escaped = String::new();
        for c in content.code_points() {
            let n = c.to_u32();
            // Arm order matters: the short escapes must win over the generic ranges below.
            match n {
                0xd => escaped.push_str("\\r"),
                0xa => escaped.push_str("\\n"),
                0x9 => escaped.push_str("\\t"),
                0x5c => escaped.push_str("\\\\"),
                0x22 => escaped.push_str("\\\""),
                // Printable ASCII is never a surrogate, so `to_char` cannot fail here.
                0x20..=0x7e => escaped.push(c.to_char().expect("printable ASCII is a valid char")),
                0x00..=0x1f | 0x7f..=0xff => escaped.push_str(&format!("\\x{:02x}", n)),
                0x100..=0xffff => escaped.push_str(&format!("\\u{:04x}", n)),
                0x10000..=0x10ffff => escaped.push_str(&format!("\\U{:08x}", n)),
                // A code point never exceeds 0x10ffff.
                _ => unreachable!(),
            }
        }
        format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), escaped)
    }))
}

/// Renders a run of adjacent Python string literals as source text (non-`wtf8` build).
///
/// Each literal is emitted as `<prefix>"<escaped content>"` and the pieces are
/// combined with `space_join`. The prefix is lowercased and every `r` is
/// removed from it — presumably because the body is always written in escaped
/// form, so keeping a raw prefix would change its meaning.
///
/// Escaping rules, applied per `char`:
/// * CR, LF, TAB, backslash and double quote use their short escapes;
/// * other printable ASCII (0x20-0x7e) is copied verbatim;
/// * remaining values up to U+00FF become `\xNN`;
/// * values up to U+FFFF become `\uNNNN`;
/// * everything above becomes `\UNNNNNNNN`.
#[cfg(not(feature="wtf8"))]
fn format_string(v: &[PyString]) -> String {
    space_join(v.iter().map(|PyString { prefix, content }| {
        // Accumulate into one buffer instead of allocating a `String` per character.
        let mut escaped = String::new();
        for c in content.chars() {
            // Arm order matters: the short escapes must win over the generic ranges below.
            match c {
                '\r' => escaped.push_str("\\r"),
                '\n' => escaped.push_str("\\n"),
                '\t' => escaped.push_str("\\t"),
                '\\' => escaped.push_str("\\\\"),
                '"' => escaped.push_str("\\\""),
                '\x20'..='\x7e' => escaped.push(c),
                // The casts below are lossless: each range fits the target width.
                '\x00'..='\x1f' | '\x7f'..='\u{ff}' => escaped.push_str(&format!("\\x{:02x}", c as u8)),
                '\u{100}'..='\u{ffff}' => escaped.push_str(&format!("\\u{:04x}", c as u16)),
                '\u{10000}'..='\u{10ffff}' => escaped.push_str(&format!("\\U{:08x}", c as u32)),
                // Kept for compilers that do not treat the `char` ranges above as exhaustive.
                _ => unreachable!(),
            }
        }
        format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), escaped)
    }))
}


fn format_expr(e: &Expression) -> String {
match e {
Expression::Ellipsis => "...".to_string(),
Expand All @@ -460,22 +497,7 @@ fn format_expr(e: &Expression) -> String {
Expression::ImaginaryInt(ref n) => format!("{}j", n),
Expression::Float(ref n) => format_float(*n),
Expression::ImaginaryFloat(ref n) => format!("{}j", format_float(*n)),
Expression::String(ref v) => {
space_join(v.iter().map(|PyString { prefix, content }|
format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), content.chars().map(|c| match c {
'\r' => "\\r".to_string(),
'\n' => "\\n".to_string(),
'\t' => "\\t".to_string(),
'\\' => "\\\\".to_string(),
'"' => "\\\"".to_string(),
'\x20'...'\x7e' => c.to_string(),
'\x00'...'\x1f' | '\x7f' | '\u{80}'...'\u{ff}' => format!("\\x{:02x}", c as u8),
'\u{100}'...'\u{ffff}' => format!("\\u{:04x}", c as u16),
'\u{10000}'...'\u{10ffff}' => format!("\\U{:08x}", c as u32),
_ => unreachable!(),
}).collect::<Vec<_>>()[..].concat())
))
},
Expression::String(ref v) => format_string(v),
Expression::Bytes(ref content) => {
format!("b\"{}\"", content.iter().map(|b| match b {
b'\r' => "\\r".to_string(),
Expand Down

0 comments on commit cd6159b

Please sign in to comment.