diff --git a/benches/bench.rs b/benches/bench.rs
index 637771b..3272a9c 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -11,7 +11,7 @@ use std::io;
 use serde::de::DeserializeOwned;
 use test::Bencher;
 
-use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer};
+use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer, Trim};
 
 static NFL: &'static str = include_str!("../examples/data/bench/nfl.csv");
@@ -128,6 +128,24 @@ macro_rules! bench {
     };
 }
 
+macro_rules! bench_trimmed {
+    ($name:ident, $data:ident, $counter:ident, $result:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            let data = $data.as_bytes();
+            b.bytes = data.len() as u64;
+            b.iter(|| {
+                let mut rdr = ReaderBuilder::new()
+                    .has_headers(false)
+                    .trim(Trim::All)
+                    .from_reader(data);
+                assert_eq!($counter(&mut rdr), $result);
+            })
+        }
+    };
+}
+
+
 macro_rules! bench_serde {
     (no_headers,
      $name:ident, $data:ident, $counter:ident, $type:ty, $result:expr) => {
@@ -213,7 +231,9 @@ bench_serde_borrowed_bytes!(
 bench_serde_borrowed_str!(
     count_nfl_deserialize_borrowed_str, NFL, NFLRowBorrowed, true, 9999);
 bench!(count_nfl_iter_bytes, NFL, count_iter_bytes, 130000);
+bench_trimmed!(count_nfl_iter_bytes_trimmed, NFL, count_iter_bytes, 130000);
 bench!(count_nfl_iter_str, NFL, count_iter_str, 130000);
+bench_trimmed!(count_nfl_iter_str_trimmed, NFL, count_iter_str, 130000);
 bench!(count_nfl_read_bytes, NFL, count_read_bytes, 130000);
 bench!(count_nfl_read_str, NFL, count_read_str, 130000);
 bench_serde!(
diff --git a/src/byte_record.rs b/src/byte_record.rs
index 003b89e..85286b9 100644
--- a/src/byte_record.rs
+++ b/src/byte_record.rs
@@ -417,6 +417,59 @@ impl ByteRecord {
         self.truncate(0);
     }
 
+    /// Trim the fields of this record so that leading and trailing whitespace
+    /// is removed.
+    ///
+    /// This method uses the ASCII definition of whitespace. That is, only
+    /// bytes in the class `[\t\n\v\f\r ]` are trimmed.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use csv::ByteRecord;
+    ///
+    /// let mut record = ByteRecord::from(vec![
+    ///     " ", "\tfoo", "bar ", "b a z",
+    /// ]);
+    /// record.trim();
+    /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]);
+    /// ```
+    pub fn trim(&mut self) {
+        let mut trimmed = 0;
+        for field in 0..self.len() {
+            self.0.bounds.ends[field] -= trimmed;
+            let bound = self.0.bounds.get(field).unwrap();
+            let front_space = self.count_leading_whitespace(bound.clone());
+            let back_space =
+                if front_space < bound.end - bound.start {
+                    self.count_leading_whitespace(bound.clone().rev())
+                } else {
+                    0
+                };
+
+            self.0.fields.drain(bound.end - back_space..bound.end);
+            self.0.fields.drain(bound.start..bound.start + front_space);
+            self.0.bounds.ends[field] -= front_space + back_space;
+            trimmed += front_space + back_space;
+        }
+    }
+
+    /// Returns the amount of leading whitespace in the given range.
+    /// Whitespace is not counted past the end of the range.
+    fn count_leading_whitespace<R>(&self, range: R) -> usize
+        where R: Iterator<Item = usize>
+    {
+        let mut count = 0;
+        for i in range {
+            match self.0.fields[i] {
+                b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => {}
+                _ => break,
+            }
+            count += 1;
+        }
+        count
+    }
+
     /// Add a new field to this record.
     ///
     /// # Example
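
The `ByteRecord::trim` added above works directly on the record's flat representation: all field bytes live in one contiguous buffer, with a table of end offsets marking field boundaries, so trimming drains whitespace bytes out of the buffer and pulls every later offset back by the running `trimmed` total. The standalone sketch below models that drain-and-shift idea on an invented `FlatRecord` type; the real `ByteRecord` internals (`self.0.fields` and `self.0.bounds.ends` in the diff) are only paraphrased here, not used.

    // Minimal model of the drain-and-shift trimming above. `FlatRecord` is a
    // hypothetical stand-in for ByteRecord's flat buffer + end-offset table.
    fn is_ascii_space(b: u8) -> bool {
        matches!(b, b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ')
    }

    struct FlatRecord {
        fields: Vec<u8>,  // all field bytes, stored back to back
        ends: Vec<usize>, // exclusive end offset of each field
    }

    impl FlatRecord {
        fn trim(&mut self) {
            let mut trimmed = 0; // total bytes removed from earlier fields
            let mut start = 0;
            for end in self.ends.iter_mut() {
                // Shift this field's end left by whatever earlier fields lost.
                *end -= trimmed;
                let field = &self.fields[start..*end];
                let front = field.iter().take_while(|&&b| is_ascii_space(b)).count();
                let back = if front < field.len() {
                    field.iter().rev().take_while(|&&b| is_ascii_space(b)).count()
                } else {
                    0 // the field was all whitespace; `front` already covers it
                };
                // Remove the whitespace bytes and shrink this field's bound.
                self.fields.drain(*end - back..*end);
                self.fields.drain(start..start + front);
                *end -= front + back;
                trimmed += front + back;
                start = *end;
            }
        }
    }

    fn main() {
        // Fields " a", "\tb ", "\tc" stored as one buffer with ends [2, 5, 7].
        let mut rec = FlatRecord { fields: b" a\tb \tc".to_vec(), ends: vec![2, 5, 7] };
        rec.trim();
        assert_eq!(rec.fields, b"abc".to_vec());
        assert_eq!(rec.ends, vec![1, 2, 3]);
    }
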
@@ -842,6 +895,71 @@ mod tests {
         assert_eq!(rec.get(1), None);
     }
 
+    #[test]
+    fn trim_whitespace_only() {
+        let mut rec = ByteRecord::from(vec![b" \t\n\r\x0c"]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("")));
+    }
+
+    #[test]
+    fn trim_front() {
+        let mut rec = ByteRecord::from(vec![b" abc"]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+
+        let mut rec = ByteRecord::from(vec![b(" abc"), b(" xyz")]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+        assert_eq!(rec.get(1), Some(b("xyz")));
+    }
+
+    #[test]
+    fn trim_back() {
+        let mut rec = ByteRecord::from(vec![b"abc "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+
+        let mut rec = ByteRecord::from(vec![b("abc "), b("xyz ")]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+        assert_eq!(rec.get(1), Some(b("xyz")));
+    }
+
+    #[test]
+    fn trim_both() {
+        let mut rec = ByteRecord::from(vec![b" abc "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+
+        let mut rec = ByteRecord::from(vec![b(" abc "), b(" xyz ")]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("abc")));
+        assert_eq!(rec.get(1), Some(b("xyz")));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_1() {
+        let mut rec = ByteRecord::from(vec![b""]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("")));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_2() {
+        let mut rec = ByteRecord::from(vec![b"", b""]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(b("")));
+        assert_eq!(rec.get(1), Some(b("")));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_3() {
+        let mut rec = ByteRecord::new();
+        rec.trim();
+        assert_eq!(rec.as_slice().len(), 0);
+    }
+
     #[test]
     fn empty_field_1() {
         let mut rec = ByteRecord::new();
diff --git a/src/lib.rs b/src/lib.rs
index f26ce3f..314bbf1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -286,6 +286,42 @@ impl Default for Terminator {
     }
 }
 
+/// The whitespace preservation behaviour when reading CSV data.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum Trim {
+    /// Preserves fields and headers. This is the default.
+    None,
+    /// Trim whitespace from headers.
+    Headers,
+    /// Trim whitespace from fields, but not headers.
+    Fields,
+    /// Trim whitespace from fields and headers.
+    All,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl Trim {
+    fn should_trim_fields(&self) -> bool {
+        self == &Trim::Fields || self == &Trim::All
+    }
+
+    fn should_trim_headers(&self) -> bool {
+        self == &Trim::Headers || self == &Trim::All
+    }
+}
+
+impl Default for Trim {
+    fn default() -> Trim {
+        Trim::None
+    }
+}
+
 /// A custom Serde deserializer for possibly invalid `Option` fields.
 ///
 /// When deserializing CSV data, it is sometimes desirable to simply ignore
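
Because of the hidden `__Nonexhaustive` variant, downstream code that matches on `Trim` needs a catch-all arm, exactly as the doc comment above warns. A small sketch of that pattern follows; the `describe` helper is invented for illustration, and the snippet assumes a `csv` release that exports this `Trim` type.

    extern crate csv;

    use csv::Trim;

    // Invented helper: maps each public trimming mode to a description. The `_`
    // arm is required because the enum reserves the right to grow new variants.
    fn describe(trim: Trim) -> &'static str {
        match trim {
            Trim::None => "keep all whitespace",
            Trim::Headers => "trim header values only",
            Trim::Fields => "trim non-header fields only",
            Trim::All => "trim headers and fields",
            _ => "unknown trimming mode",
        }
    }

    fn main() {
        assert_eq!(describe(Trim::default()), "keep all whitespace");
        assert_eq!(describe(Trim::All), "trim headers and fields");
    }
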
diff --git a/src/reader.rs b/src/reader.rs
index 53507f7..627b0f1 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -12,7 +12,7 @@ use serde::de::DeserializeOwned;
 use byte_record::{self, ByteRecord, Position};
 use error::{ErrorKind, Result, Utf8Error, new_error};
 use string_record::{self, StringRecord};
-use Terminator;
+use {Terminator, Trim};
 
 /// Builds a CSV reader with various configuration knobs.
 ///
@@ -24,6 +24,7 @@ pub struct ReaderBuilder {
     capacity: usize,
     flexible: bool,
     has_headers: bool,
+    trim: Trim,
     /// The underlying CSV parser builder.
     ///
     /// We explicitly put this on the heap because CoreReaderBuilder embeds an
@@ -38,6 +39,7 @@ impl Default for ReaderBuilder {
             capacity: 8 * (1<<10),
             flexible: false,
             has_headers: true,
+            trim: Trim::default(),
            builder: Box::new(CoreReaderBuilder::default()),
         }
     }
@@ -319,6 +321,59 @@ impl ReaderBuilder {
         self
     }
 
+    /// Whether fields are trimmed of leading and trailing whitespace or not.
+    ///
+    /// By default, no trimming is performed. This method permits one to
+    /// override that behavior and choose one of the following options:
+    ///
+    /// 1. `Trim::Headers` trims only header values.
+    /// 2. `Trim::Fields` trims only non-header or "field" values.
+    /// 3. `Trim::All` trims both header and non-header values.
+    ///
+    /// A value is only interpreted as a header value if this CSV reader is
+    /// configured to read a header record (which is the default).
+    ///
+    /// When reading string records, characters meeting the definition of
+    /// Unicode whitespace are trimmed. When reading byte records, characters
+    /// meeting the definition of ASCII whitespace are trimmed. ASCII
+    /// whitespace characters correspond to the set `[\t\n\v\f\r ]`.
+    ///
+    /// # Example
+    ///
+    /// This example shows what happens when all values are trimmed.
+    ///
+    /// ```
+    /// extern crate csv;
+    ///
+    /// use std::error::Error;
+    /// use csv::{ReaderBuilder, StringRecord, Trim};
+    ///
+    /// # fn main() { example().unwrap(); }
+    /// fn example() -> Result<(), Box<Error>> {
+    ///     let data = "\
+    /// city , country , pop
+    /// Boston,\"
+    ///    United States\",4628910
+    /// Concord, United States ,42695
+    /// ";
+    ///     let mut rdr = ReaderBuilder::new()
+    ///         .trim(Trim::All)
+    ///         .from_reader(data.as_bytes());
+    ///     let records = rdr
+    ///         .records()
+    ///         .collect::<Result<Vec<StringRecord>, csv::Error>>()?;
+    ///     assert_eq!(records, vec![
+    ///         vec!["Boston", "United States", "4628910"],
+    ///         vec!["Concord", "United States", "42695"],
+    ///     ]);
+    ///     Ok(())
+    /// }
+    /// ```
+    pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder {
+        self.trim = trim;
+        self
+    }
+
     /// The record terminator to use when parsing CSV.
     ///
     /// A record terminator can be any single byte. The default is a special
@@ -710,6 +765,7 @@ struct ReaderState {
     /// set, every record must have the same number of fields, or else an error
     /// is reported.
     flexible: bool,
+    trim: Trim,
     /// The number of fields in the first record parsed.
     first_field_count: Option<u64>,
     /// The current position of the parser.
@@ -776,6 +832,7 @@ impl<R: io::Read> Reader<R> {
                 headers: None,
                 has_headers: builder.has_headers,
                 flexible: builder.flexible,
+                trim: builder.trim,
                 first_field_count: None,
                 cur_pos: Position::new(),
                 first: false,
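
Beyond the `records()` example in the doc comment above, trimming also matters for Serde-based deserialization: padded header names would not match struct field names, and padded numbers would fail to parse. A hedged sketch of that use follows, with an invented `Row` struct and sample data; it assumes `serde_derive` is available and a `csv` release that includes this `trim` option.

    extern crate csv;
    #[macro_use]
    extern crate serde_derive;

    use std::error::Error;
    use csv::{ReaderBuilder, Trim};

    #[derive(Debug, Deserialize)]
    struct Row {
        city: String,
        pop: u64,
    }

    fn example() -> Result<(), Box<Error>> {
        // Both the header names and the values carry stray whitespace.
        let data = " city , pop \n Boston , 4628910 \n";
        let mut rdr = ReaderBuilder::new()
            .trim(Trim::All)
            .from_reader(data.as_bytes());
        for result in rdr.deserialize() {
            // Without Trim::All, " pop " would not match the struct field name
            // and " 4628910 " would not parse as a u64.
            let row: Row = result?;
            assert_eq!(row.city, "Boston");
            assert_eq!(row.pop, 4628910);
        }
        Ok(())
    }

    fn main() { example().unwrap(); }
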
@@ -1447,7 +1504,7 @@ impl<R: io::Read> Reader<R> {
     ) {
         // If we have string headers, then get byte headers. But if we have
         // byte headers, then get the string headers (or a UTF-8 error).
-        let (str_headers, byte_headers) = match headers {
+        let (mut str_headers, mut byte_headers) = match headers {
             Ok(string) => {
                 let bytes = string.clone().into_byte_record();
                 (Ok(string), bytes)
             }
                 }
             }
         };
+        if self.state.trim.should_trim_headers() {
+            if let Ok(ref mut str_headers) = str_headers.as_mut() {
+                str_headers.trim();
+            }
+            byte_headers.trim();
+        }
         self.state.headers = Some(Headers {
             byte_record: byte_headers,
             string_record: str_headers,
@@ -1505,7 +1568,14 @@
     /// }
     /// ```
     pub fn read_record(&mut self, record: &mut StringRecord) -> Result<bool> {
-        string_record::read(self, record)
+        let result = string_record::read(self, record);
+        // We need to trim again because trimming string records includes
+        // Unicode whitespace. (ByteRecord trimming only includes ASCII
+        // whitespace.)
+        if self.state.trim.should_trim_fields() {
+            record.trim();
+        }
+        result
     }
 
     /// Read a single row into the given byte record. Returns false when no
@@ -1558,6 +1628,9 @@
             if let Some(ref headers) = self.state.headers {
                 self.state.first = true;
                 record.clone_from(&headers.byte_record);
+                if self.state.trim.should_trim_fields() {
+                    record.trim();
+                }
                 return Ok(!record.is_empty());
             }
         }
@@ -1569,8 +1642,14 @@
             // never return the first row. Instead, we should attempt to
             // read and return the next one.
             if self.state.has_headers {
-                return self.read_byte_record_impl(record);
+                let result = self.read_byte_record_impl(record);
+                if self.state.trim.should_trim_fields() {
+                    record.trim();
+                }
+                return result;
             }
+        } else if self.state.trim.should_trim_fields() {
+            record.trim();
         }
         Ok(ok)
     }
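
The comment in `read_record` above is the reason records can end up trimmed twice: `ByteRecord::trim` only strips the ASCII class `[\t\n\v\f\r ]`, while `StringRecord::trim` defers to Rust's Unicode-aware `str::trim`. A small illustration of the difference using U+00A0 (no-break space), which is Unicode whitespace but not ASCII whitespace; the record contents are invented for the example.

    extern crate csv;

    use csv::{ByteRecord, StringRecord};

    fn main() {
        // ASCII trimming removes the trailing space but leaves the leading NBSP.
        let mut bytes = ByteRecord::from(vec!["\u{00A0}foo ".as_bytes()]);
        bytes.trim();
        assert_eq!(&bytes[0], "\u{00A0}foo".as_bytes());

        // Unicode trimming removes both.
        let mut strings = StringRecord::from(vec!["\u{00A0}foo "]);
        strings.trim();
        assert_eq!(&strings[0], "foo");
    }
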
@@ -2134,7 +2213,7 @@ mod tests {
     use error::ErrorKind;
     use string_record::StringRecord;
 
-    use super::{ReaderBuilder, Position};
+    use super::{ReaderBuilder, Position, Trim};
 
     fn b(s: &str) -> &[u8] { s.as_bytes() }
     fn s(b: &[u8]) -> &str { ::std::str::from_utf8(b).unwrap() }
@@ -2168,6 +2247,102 @@
         assert!(!rdr.read_byte_record(&mut rec).unwrap());
     }
 
+    #[test]
+    fn read_trimmed_records_and_headers() {
+        let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
+        let mut rdr = ReaderBuilder::new()
+            .has_headers(true)
+            .trim(Trim::All)
+            .from_reader(data);
+        let mut rec = ByteRecord::new();
+        assert!(rdr.read_byte_record(&mut rec).unwrap());
+        assert_eq!("1", s(&rec[0]));
+        assert_eq!("2", s(&rec[1]));
+        assert_eq!("3", s(&rec[2]));
+        let mut rec = StringRecord::new();
+        assert!(rdr.read_record(&mut rec).unwrap());
+        assert_eq!("1", &rec[0]);
+        assert_eq!("", &rec[1]);
+        assert_eq!("3", &rec[2]);
+        {
+            let headers = rdr.headers().unwrap();
+            assert_eq!(3, headers.len());
+            assert_eq!("foo", &headers[0]);
+            assert_eq!("bar", &headers[1]);
+            assert_eq!("baz", &headers[2]);
+        }
+    }
+
+    #[test]
+    fn read_trimmed_header() {
+        let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
+        let mut rdr = ReaderBuilder::new()
+            .has_headers(true)
+            .trim(Trim::Headers)
+            .from_reader(data);
+        let mut rec = ByteRecord::new();
+        assert!(rdr.read_byte_record(&mut rec).unwrap());
+        assert_eq!(" 1", s(&rec[0]));
+        assert_eq!(" 2", s(&rec[1]));
+        assert_eq!(" 3", s(&rec[2]));
+        {
+            let headers = rdr.headers().unwrap();
+            assert_eq!(3, headers.len());
+            assert_eq!("foo", &headers[0]);
+            assert_eq!("bar", &headers[1]);
+            assert_eq!("baz", &headers[2]);
+        }
+    }
+
+    #[test]
+    fn read_trimmed_header_invalid_utf8() {
+        let data = &b"foo, b\xFFar,\tbaz\na,b,c\nd,e,f"[..];
+        let mut rdr = ReaderBuilder::new()
+            .has_headers(true)
+            .trim(Trim::Headers)
+            .from_reader(data);
+        let mut rec = StringRecord::new();
+
+        let _ = rdr.read_record(&mut rec); // force the headers to be read
+        // Check that the byte headers are trimmed and the string headers still error.
+        {
+            let headers = rdr.byte_headers().unwrap();
+            assert_eq!(3, headers.len());
+            assert_eq!(b"foo", &headers[0]);
+            assert_eq!(b"b\xFFar", &headers[1]);
+            assert_eq!(b"baz", &headers[2]);
+        }
+        match *rdr.headers().unwrap_err().kind() {
+            ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
+                assert_eq!(pos, &newpos(0, 1, 0));
+                assert_eq!(err.field(), 1);
+                assert_eq!(err.valid_up_to(), 3);
+            }
+            ref err => panic!("match failed, got {:?}", err),
+        }
+    }
+
+    #[test]
+    fn read_trimmed_records() {
+        let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
+        let mut rdr = ReaderBuilder::new()
+            .has_headers(true)
+            .trim(Trim::Fields)
+            .from_reader(data);
+        let mut rec = ByteRecord::new();
+        assert!(rdr.read_byte_record(&mut rec).unwrap());
+        assert_eq!("1", s(&rec[0]));
+        assert_eq!("2", s(&rec[1]));
+        assert_eq!("3", s(&rec[2]));
+        {
+            let headers = rdr.headers().unwrap();
+            assert_eq!(3, headers.len());
+            assert_eq!("foo", &headers[0]);
+            assert_eq!(" bar", &headers[1]);
+            assert_eq!("\tbaz", &headers[2]);
+        }
+    }
+
     #[test]
     fn read_record_unequal_fails() {
         let data = b("foo\nbar,baz");
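
The invalid-UTF-8 test above doubles as a recipe: with `Trim::Headers`, a header row that is not valid UTF-8 still yields trimmed headers through `byte_headers()`, while `headers()` keeps reporting the UTF-8 error, so callers can fall back to the byte view. A condensed sketch of that fallback, with invented sample data:

    extern crate csv;

    use csv::{ReaderBuilder, Trim};

    fn main() {
        // The second header contains the invalid UTF-8 byte 0xFF.
        let data: &[u8] = b" foo ,\tb\xFFar\n1,2\n";
        let mut rdr = ReaderBuilder::new()
            .trim(Trim::Headers)
            .from_reader(data);
        {
            // The raw byte view is always available, and it has been trimmed.
            let headers = rdr.byte_headers().unwrap();
            assert_eq!(&headers[0], &b"foo"[..]);
            assert_eq!(&headers[1], &b"b\xFFar"[..]);
        }
        // The UTF-8 view still surfaces the error for the offending field.
        assert!(rdr.headers().is_err());
    }
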
diff --git a/src/string_record.rs b/src/string_record.rs
index 6cbdef6..bd518f3 100644
--- a/src/string_record.rs
+++ b/src/string_record.rs
@@ -445,6 +445,36 @@ impl StringRecord {
         self.0.clear();
     }
 
+    /// Trim the fields of this record so that leading and trailing whitespace
+    /// is removed.
+    ///
+    /// This method uses the Unicode definition of whitespace.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use csv::StringRecord;
+    ///
+    /// let mut record = StringRecord::from(vec![
+    ///     " ", "\u{3000}\tfoo ", "bar ", "b a z",
+    /// ]);
+    /// record.trim();
+    /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]);
+    /// ```
+    pub fn trim(&mut self) {
+        let length = self.len();
+        if length == 0 {
+            return;
+        }
+        // TODO: We could likely do this in place, but for now, we allocate.
+        let mut trimmed = StringRecord::with_capacity(
+            self.as_slice().len(), self.len());
+        for mut field in &*self {
+            trimmed.push_field(field.trim());
+        }
+        *self = trimmed;
+    }
+
     /// Add a new field to this record.
     ///
     /// # Example
@@ -702,3 +732,75 @@ impl<'r> DoubleEndedIterator for StringRecordIter<'r> {
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use string_record::StringRecord;
+
+    #[test]
+    fn trim_front() {
+        let mut rec = StringRecord::from(vec![" abc"]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+
+        let mut rec = StringRecord::from(vec![" abc", " xyz"]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+        assert_eq!(rec.get(1), Some("xyz"));
+    }
+
+    #[test]
+    fn trim_back() {
+        let mut rec = StringRecord::from(vec!["abc "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+
+        let mut rec = StringRecord::from(vec!["abc ", "xyz "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+        assert_eq!(rec.get(1), Some("xyz"));
+    }
+
+    #[test]
+    fn trim_both() {
+        let mut rec = StringRecord::from(vec![" abc "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+
+        let mut rec = StringRecord::from(vec![" abc ", " xyz "]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some("abc"));
+        assert_eq!(rec.get(1), Some("xyz"));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_1() {
+        let mut rec = StringRecord::from(vec![""]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(""));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_2() {
+        let mut rec = StringRecord::from(vec!["", ""]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(""));
+        assert_eq!(rec.get(1), Some(""));
+    }
+
+    #[test]
+    fn trim_does_not_panic_on_empty_records_3() {
+        let mut rec = StringRecord::new();
+        rec.trim();
+        assert_eq!(rec.as_slice().len(), 0);
+    }
+
+    #[test]
+    fn trim_whitespace_only() {
+        let mut rec = StringRecord::from(vec![
+            "\u{0009}\u{000A}\u{000B}\u{000C}\u{000D}\u{0020}\u{0085}\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}",
+        ]);
+        rec.trim();
+        assert_eq!(rec.get(0), Some(""));
+    }
+}