From d08388b9d8c6e00e7acc139619a55a8133a9510d Mon Sep 17 00:00:00 2001 From: Michael Edwards Date: Sun, 5 Nov 2017 17:36:27 +0100 Subject: [PATCH] reading: provide trim functionality This commit adds support for trimming CSV records. There are two levels of support: 1. Both `ByteRecord` and `StringRecord` have grown `trim` methods. A `ByteRecord` trims ASCII whitespace while a `StringRecord` trims Unicode whitespace. 2. The CSV reader can now be configured to automatically trim all records that it reads. This is useful when using Serde to match header names with spaces (for example) to struct member names. Fixes #78 --- benches/bench.rs | 22 ++++- src/byte_record.rs | 118 +++++++++++++++++++++++++++ src/lib.rs | 36 +++++++++ src/reader.rs | 185 +++++++++++++++++++++++++++++++++++++++++-- src/string_record.rs | 102 ++++++++++++++++++++++++ 5 files changed, 457 insertions(+), 6 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 637771b..3272a9c 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -11,7 +11,7 @@ use std::io; use serde::de::DeserializeOwned; use test::Bencher; -use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer}; +use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer, Trim}; static NFL: &'static str = include_str!("../examples/data/bench/nfl.csv"); @@ -128,6 +128,24 @@ macro_rules! bench { }; } +macro_rules! bench_trimmed { + ($name:ident, $data:ident, $counter:ident, $result:expr) => { + #[bench] + fn $name(b: &mut Bencher) { + let data = $data.as_bytes(); + b.bytes = data.len() as u64; + b.iter(|| { + let mut rdr = ReaderBuilder::new() + .has_headers(false) + .trim(Trim::All) + .from_reader(data); + assert_eq!($counter(&mut rdr), $result); + }) + } + }; +} + + macro_rules! 
bench_serde { (no_headers, $name:ident, $data:ident, $counter:ident, $type:ty, $result:expr) => { @@ -213,7 +231,9 @@ bench_serde_borrowed_bytes!( bench_serde_borrowed_str!( count_nfl_deserialize_borrowed_str, NFL, NFLRowBorrowed, true, 9999); bench!(count_nfl_iter_bytes, NFL, count_iter_bytes, 130000); +bench_trimmed!(count_nfl_iter_bytes_trimmed, NFL, count_iter_bytes, 130000); bench!(count_nfl_iter_str, NFL, count_iter_str, 130000); +bench_trimmed!(count_nfl_iter_str_trimmed, NFL, count_iter_str, 130000); bench!(count_nfl_read_bytes, NFL, count_read_bytes, 130000); bench!(count_nfl_read_str, NFL, count_read_str, 130000); bench_serde!( diff --git a/src/byte_record.rs b/src/byte_record.rs index 003b89e..85286b9 100644 --- a/src/byte_record.rs +++ b/src/byte_record.rs @@ -417,6 +417,59 @@ impl ByteRecord { self.truncate(0); } + /// Trim the fields of this record so that leading and trailing whitespace + /// is removed. + /// + /// This method uses the ASCII definition of whitespace. That is, only + /// bytes in the class `[\t\n\v\f\r ]` are trimmed. + /// + /// # Example + /// + /// ``` + /// use csv::ByteRecord; + /// + /// let mut record = ByteRecord::from(vec![ + /// " ", "\tfoo", "bar ", "b a z", + /// ]); + /// record.trim(); + /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]); + /// ``` + pub fn trim(&mut self) { + let mut trimmed = 0; + for field in 0..self.len() { + self.0.bounds.ends[field] -= trimmed; + let bound = self.0.bounds.get(field).unwrap(); + let front_space = self.count_leading_whitespace(bound.clone()); + let back_space = + if front_space < bound.end - bound.start { + self.count_leading_whitespace(bound.clone().rev()) + } else { + 0 + }; + + self.0.fields.drain(bound.end - back_space..bound.end); + self.0.fields.drain(bound.start..bound.start + front_space); + self.0.bounds.ends[field] -= front_space + back_space; + trimmed += front_space + back_space; + } + } + + /// Returns amount of leading whitespace starting in the given range. 
+ /// Whitespace is not counted past the end of the range. + fn count_leading_whitespace<R>(&self, range: R) -> usize + where R: Iterator<Item = usize> + { + let mut count = 0; + for i in range { + match self.0.fields[i] { + b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => {} + _ => break, + } + count += 1; + } + count + } + /// Add a new field to this record. /// /// # Example @@ -842,6 +895,71 @@ mod tests { assert_eq!(rec.get(1), None); } + #[test] + fn trim_whitespace_only() { + let mut rec = ByteRecord::from(vec![b" \t\n\r\x0c"]); + rec.trim(); + assert_eq!(rec.get(0), Some(b(""))); + } + + #[test] + fn trim_front() { + let mut rec = ByteRecord::from(vec![b" abc"]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + + let mut rec = ByteRecord::from(vec![b(" abc"), b(" xyz")]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + assert_eq!(rec.get(1), Some(b("xyz"))); + } + + #[test] + fn trim_back() { + let mut rec = ByteRecord::from(vec![b"abc "]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + + let mut rec = ByteRecord::from(vec![b("abc "), b("xyz ")]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + assert_eq!(rec.get(1), Some(b("xyz"))); + } + + #[test] + fn trim_both() { + let mut rec = ByteRecord::from(vec![b" abc "]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + + let mut rec = ByteRecord::from(vec![b(" abc "), b(" xyz ")]); + rec.trim(); + assert_eq!(rec.get(0), Some(b("abc"))); + assert_eq!(rec.get(1), Some(b("xyz"))); + } + + #[test] + fn trim_does_not_panic_on_empty_records_1() { + let mut rec = ByteRecord::from(vec![b""]); + rec.trim(); + assert_eq!(rec.get(0), Some(b(""))); + } + + #[test] + fn trim_does_not_panic_on_empty_records_2() { + let mut rec = ByteRecord::from(vec![b"", b""]); + rec.trim(); + assert_eq!(rec.get(0), Some(b(""))); + assert_eq!(rec.get(1), Some(b(""))); + } + + #[test] + fn trim_does_not_panic_on_empty_records_3() { + let mut rec = ByteRecord::new(); + rec.trim(); + 
assert_eq!(rec.as_slice().len(), 0); + } + #[test] fn empty_field_1() { let mut rec = ByteRecord::new(); diff --git a/src/lib.rs b/src/lib.rs index f26ce3f..314bbf1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -286,6 +286,42 @@ impl Default for Terminator { } } +/// The whitespace preservation behaviour when reading CSV data. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Trim { + /// Preserves fields and headers. This is the default. + None, + /// Trim whitespace from headers. + Headers, + /// Trim whitespace from fields, but not headers. + Fields, + /// Trim whitespace from fields and headers. + All, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl Trim { + fn should_trim_fields(&self) -> bool { + self == &Trim::Fields || self == &Trim::All + } + + fn should_trim_headers(&self) -> bool { + self == &Trim::Headers || self == &Trim::All + } +} + +impl Default for Trim { + fn default() -> Trim { + Trim::None + } +} + /// A custom Serde deserializer for possibly invalid `Option` fields. /// /// When deserializing CSV data, it is sometimes desirable to simply ignore diff --git a/src/reader.rs b/src/reader.rs index 53507f7..627b0f1 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -12,7 +12,7 @@ use serde::de::DeserializeOwned; use byte_record::{self, ByteRecord, Position}; use error::{ErrorKind, Result, Utf8Error, new_error}; use string_record::{self, StringRecord}; -use Terminator; +use {Terminator, Trim}; /// Builds a CSV reader with various configuration knobs. /// @@ -24,6 +24,7 @@ pub struct ReaderBuilder { capacity: usize, flexible: bool, has_headers: bool, + trim: Trim, /// The underlying CSV parser builder. 
/// /// We explicitly put this on the heap because CoreReaderBuilder embeds an @@ -38,6 +39,7 @@ impl Default for ReaderBuilder { capacity: 8 * (1<<10), flexible: false, has_headers: true, + trim: Trim::default(), builder: Box::new(CoreReaderBuilder::default()), } } @@ -319,6 +321,59 @@ impl ReaderBuilder { self } + /// Whether fields are trimmed of leading and trailing whitespace or not. + /// + /// By default, no trimming is performed. This method permits one to + /// override that behavior and choose one of the following options: + /// + /// 1. `Trim::Headers` trims only header values. + /// 2. `Trim::Fields` trims only non-header or "field" values. + /// 3. `Trim::All` trims both header and non-header values. + /// + /// A value is only interpreted as a header value if this CSV reader is + /// configured to read a header record (which is the default). + /// + /// When reading string records, characters meeting the definition of + /// Unicode whitespace are trimmed. When reading byte records, characters + /// meeting the definition of ASCII whitespace are trimmed. ASCII + /// whitespace characters correspond to the set `[\t\n\v\f\r ]`. + /// + /// # Example + /// + /// This example shows what happens when all values are trimmed. 
+ /// + /// ``` + /// extern crate csv; + /// + /// use std::error::Error; + /// use csv::{ReaderBuilder, StringRecord, Trim}; + /// + /// # fn main() { example().unwrap(); } + /// fn example() -> Result<(), Box<Error>> { + /// let data = "\ + /// city , country , pop + /// Boston,\" + /// United States\",4628910 + /// Concord, United States ,42695 + /// "; + /// let mut rdr = ReaderBuilder::new() + /// .trim(Trim::All) + /// .from_reader(data.as_bytes()); + /// let records = rdr + /// .records() + /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?; + /// assert_eq!(records, vec![ + /// vec!["Boston", "United States", "4628910"], + /// vec!["Concord", "United States", "42695"], + /// ]); + /// Ok(()) + /// } + /// ``` + pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder { + self.trim = trim; + self + } + /// The record terminator to use when parsing CSV. /// /// A record terminator can be any single byte. The default is a special @@ -710,6 +765,7 @@ struct ReaderState { /// set, every record must have the same number of fields, or else an error /// is reported. flexible: bool, + trim: Trim, /// The number of fields in the first record parsed. first_field_count: Option<u64>, /// The current position of the parser. @@ -776,6 +832,7 @@ impl<R: io::Read> Reader<R> { headers: None, has_headers: builder.has_headers, flexible: builder.flexible, + trim: builder.trim, first_field_count: None, cur_pos: Position::new(), first: false, @@ -1447,7 +1504,7 @@ impl<R: io::Read> Reader<R> { ) { // If we have string headers, then get byte headers. But if we have // byte headers, then get the string headers (or a UTF-8 error). 
- let (str_headers, byte_headers) = match headers { + let (mut str_headers, mut byte_headers) = match headers { Ok(string) => { let bytes = string.clone().into_byte_record(); (Ok(string), bytes) @@ -1459,6 +1516,12 @@ } } }; + if self.state.trim.should_trim_headers() { + if let Ok(ref mut str_headers) = str_headers.as_mut() { + str_headers.trim(); + } + byte_headers.trim(); + } self.state.headers = Some(Headers { byte_record: byte_headers, string_record: str_headers, @@ -1505,7 +1568,14 @@ /// } /// ``` pub fn read_record(&mut self, record: &mut StringRecord) -> Result<bool> { - string_record::read(self, record) + let result = string_record::read(self, record); + // We need to trim again because trimming string records includes + // Unicode whitespace. (ByteRecord trimming only includes ASCII + // whitespace.) + if self.state.trim.should_trim_fields() { + record.trim(); + } + result } /// Read a single row into the given byte record. Returns false when no @@ -1558,6 +1628,9 @@ if let Some(ref headers) = self.state.headers { self.state.first = true; record.clone_from(&headers.byte_record); + if self.state.trim.should_trim_fields() { + record.trim(); + } return Ok(!record.is_empty()); } } @@ -1569,8 +1642,14 @@ // never return the first row. Instead, we should attempt to // read and return the next one. 
if self.state.has_headers { - return self.read_byte_record_impl(record); + let result = self.read_byte_record_impl(record); + if self.state.trim.should_trim_fields() { + record.trim(); + } + return result; } + } else if self.state.trim.should_trim_fields() { + record.trim(); } Ok(ok) } @@ -2134,7 +2213,7 @@ mod tests { use error::ErrorKind; use string_record::StringRecord; - use super::{ReaderBuilder, Position}; + use super::{ReaderBuilder, Position, Trim}; fn b(s: &str) -> &[u8] { s.as_bytes() } fn s(b: &[u8]) -> &str { ::std::str::from_utf8(b).unwrap() } @@ -2168,6 +2247,102 @@ mod tests { assert!(!rdr.read_byte_record(&mut rec).unwrap()); } + #[test] + fn read_trimmed_records_and_headers() { + let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .trim(Trim::All) + .from_reader(data); + let mut rec = ByteRecord::new(); + assert!(rdr.read_byte_record(&mut rec).unwrap()); + assert_eq!("1", s(&rec[0])); + assert_eq!("2", s(&rec[1])); + assert_eq!("3", s(&rec[2])); + let mut rec = StringRecord::new(); + assert!(rdr.read_record(&mut rec).unwrap()); + assert_eq!("1", &rec[0]); + assert_eq!("", &rec[1]); + assert_eq!("3", &rec[2]); + { + let headers = rdr.headers().unwrap(); + assert_eq!(3, headers.len()); + assert_eq!("foo", &headers[0]); + assert_eq!("bar", &headers[1]); + assert_eq!("baz", &headers[2]); + } + } + + #[test] + fn read_trimmed_header() { + let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .trim(Trim::Headers) + .from_reader(data); + let mut rec = ByteRecord::new(); + assert!(rdr.read_byte_record(&mut rec).unwrap()); + assert_eq!(" 1", s(&rec[0])); + assert_eq!(" 2", s(&rec[1])); + assert_eq!(" 3", s(&rec[2])); + { + let headers = rdr.headers().unwrap(); + assert_eq!(3, headers.len()); + assert_eq!("foo", &headers[0]); + assert_eq!("bar", &headers[1]); + assert_eq!("baz", &headers[2]); + } + } + + #[test] + fn 
read_trimmed_header_invalid_utf8() { + let data = &b"foo, b\xFFar,\tbaz\na,b,c\nd,e,f"[..]; + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .trim(Trim::Headers) + .from_reader(data); + let mut rec = StringRecord::new(); + + let _ = rdr.read_record(&mut rec); // force the headers to be read + // Check the byte headers are trimmed and string headers are still errors + { + let headers = rdr.byte_headers().unwrap(); + assert_eq!(3, headers.len()); + assert_eq!(b"foo", &headers[0]); + assert_eq!(b"b\xFFar", &headers[1]); + assert_eq!(b"baz", &headers[2]); + } + match *rdr.headers().unwrap_err().kind() { + ErrorKind::Utf8 { pos: Some(ref pos), ref err } => { + assert_eq!(pos, &newpos(0, 1, 0)); + assert_eq!(err.field(), 1); + assert_eq!(err.valid_up_to(), 3); + } + ref err => panic!("match failed, got {:?}", err), + } + } + + #[test] + fn read_trimmed_records() { + let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t"); + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .trim(Trim::Fields) + .from_reader(data); + let mut rec = ByteRecord::new(); + assert!(rdr.read_byte_record(&mut rec).unwrap()); + assert_eq!("1", s(&rec[0])); + assert_eq!("2", s(&rec[1])); + assert_eq!("3", s(&rec[2])); + { + let headers = rdr.headers().unwrap(); + assert_eq!(3, headers.len()); + assert_eq!("foo", &headers[0]); + assert_eq!(" bar", &headers[1]); + assert_eq!("\tbaz", &headers[2]); + } + } + #[test] fn read_record_unequal_fails() { let data = b("foo\nbar,baz"); diff --git a/src/string_record.rs b/src/string_record.rs index 6cbdef6..bd518f3 100644 --- a/src/string_record.rs +++ b/src/string_record.rs @@ -445,6 +445,36 @@ impl StringRecord { self.0.clear(); } + /// Trim the fields of this record so that leading and trailing whitespace + /// is removed. + /// + /// This method uses the Unicode definition of whitespace. 
+ /// + /// # Example + /// + /// ``` + /// use csv::StringRecord; + /// + /// let mut record = StringRecord::from(vec![ + /// " ", "\u{3000}\tfoo ", "bar ", "b a z", + /// ]); + /// record.trim(); + /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]); + /// ``` + pub fn trim(&mut self) { + let length = self.len(); + if length == 0 { + return; + } + // TODO: We could likely do this in place, but for now, we allocate. + let mut trimmed = StringRecord::with_capacity( + self.as_slice().len(), self.len()); + for mut field in &*self { + trimmed.push_field(field.trim()); + } + *self = trimmed; + } + /// Add a new field to this record. /// /// # Example @@ -702,3 +732,75 @@ impl<'r> DoubleEndedIterator for StringRecordIter<'r> { }) } } + +#[cfg(test)] +mod tests { + use string_record::StringRecord; + + #[test] + fn trim_front() { + let mut rec = StringRecord::from(vec![" abc"]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + + let mut rec = StringRecord::from(vec![" abc", " xyz"]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + assert_eq!(rec.get(1), Some("xyz")); + } + + #[test] + fn trim_back() { + let mut rec = StringRecord::from(vec!["abc "]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + + let mut rec = StringRecord::from(vec!["abc ", "xyz "]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + assert_eq!(rec.get(1), Some("xyz")); + } + + #[test] + fn trim_both() { + let mut rec = StringRecord::from(vec![" abc "]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + + let mut rec = StringRecord::from(vec![" abc ", " xyz "]); + rec.trim(); + assert_eq!(rec.get(0), Some("abc")); + assert_eq!(rec.get(1), Some("xyz")); + } + + #[test] + fn trim_does_not_panic_on_empty_records_1() { + let mut rec = StringRecord::from(vec![""]); + rec.trim(); + assert_eq!(rec.get(0), Some("")); + } + + #[test] + fn trim_does_not_panic_on_empty_records_2() { + let mut rec = StringRecord::from(vec!["", ""]); + rec.trim(); + assert_eq!(rec.get(0), 
Some("")); + assert_eq!(rec.get(1), Some("")); + } + + #[test] + fn trim_does_not_panic_on_empty_records_3() { + let mut rec = StringRecord::new(); + rec.trim(); + assert_eq!(rec.as_slice().len(), 0); + } + + #[test] + fn trim_whitespace_only() { + let mut rec = StringRecord::from(vec![ + "\u{0009}\u{000A}\u{000B}\u{000C}\u{000D}\u{0020}\u{0085}\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}", + ]); + rec.trim(); + assert_eq!(rec.get(0), Some("")); + } +}