Skip to content

Commit

Permalink
reading: provide trim functionality
Browse files Browse the repository at this point in the history
This commit adds support for trimming CSV records. There are two levels
of support:

  1. Both `ByteRecord` and `StringRecord` have grown `trim` methods. A
     `ByteRecord` trims ASCII whitespace while a `StringRecord` trims
     Unicode whitespace.
  2. The CSV reader can now be configured to automatically trim all
     records that it reads. This is useful when using Serde to match
     header names with spaces (for example) to struct member names.

Fixes #78
  • Loading branch information
medwards authored and BurntSushi committed Jan 30, 2018
1 parent 9030a4a commit d08388b
Show file tree
Hide file tree
Showing 5 changed files with 457 additions and 6 deletions.
22 changes: 21 additions & 1 deletion benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::io;
use serde::de::DeserializeOwned;
use test::Bencher;

use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer};
use csv::{ByteRecord, Reader, ReaderBuilder, StringRecord, Writer, Trim};

static NFL: &'static str =
include_str!("../examples/data/bench/nfl.csv");
Expand Down Expand Up @@ -128,6 +128,24 @@ macro_rules! bench {
};
}

// Benchmark a CSV reader configured to trim whitespace from every record
// (`Trim::All`). Mirrors the plain `bench!` macro above so the cost of
// trimming can be compared directly against the untrimmed runs.
macro_rules! bench_trimmed {
    ($name:ident, $data:ident, $counter:ident, $result:expr) => {
        #[bench]
        fn $name(b: &mut Bencher) {
            // Report throughput as bytes/second over the raw input.
            let data = $data.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| {
                // Reader construction happens inside the closure, so its
                // cost is included in every measured iteration.
                let mut rdr = ReaderBuilder::new()
                    .has_headers(false)
                    .trim(Trim::All)
                    .from_reader(data);
                assert_eq!($counter(&mut rdr), $result);
            })
        }
    };
}


macro_rules! bench_serde {
(no_headers,
$name:ident, $data:ident, $counter:ident, $type:ty, $result:expr) => {
Expand Down Expand Up @@ -213,7 +231,9 @@ bench_serde_borrowed_bytes!(
bench_serde_borrowed_str!(
count_nfl_deserialize_borrowed_str, NFL, NFLRowBorrowed, true, 9999);
bench!(count_nfl_iter_bytes, NFL, count_iter_bytes, 130000);
bench_trimmed!(count_nfl_iter_bytes_trimmed, NFL, count_iter_bytes, 130000);
bench!(count_nfl_iter_str, NFL, count_iter_str, 130000);
bench_trimmed!(count_nfl_iter_str_trimmed, NFL, count_iter_str, 130000);
bench!(count_nfl_read_bytes, NFL, count_read_bytes, 130000);
bench!(count_nfl_read_str, NFL, count_read_str, 130000);
bench_serde!(
Expand Down
118 changes: 118 additions & 0 deletions src/byte_record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,59 @@ impl ByteRecord {
self.truncate(0);
}

/// Trim the fields of this record so that leading and trailing whitespace
/// is removed.
///
/// This method uses the ASCII definition of whitespace. That is, only
/// bytes in the class `[\t\n\v\f\r ]` are trimmed.
///
/// # Example
///
/// ```
/// use csv::ByteRecord;
///
/// let mut record = ByteRecord::from(vec![
/// " ", "\tfoo", "bar ", "b a z",
/// ]);
/// record.trim();
/// assert_eq!(record, vec!["", "foo", "bar", "b a z"]);
/// ```
pub fn trim(&mut self) {
    // Total bytes removed from the shared `fields` buffer so far. Each
    // field's recorded end offset must be shifted left by this amount
    // before the field is processed, because earlier drains moved it.
    let mut trimmed = 0;
    for field in 0..self.len() {
        self.0.bounds.ends[field] -= trimmed;
        let bound = self.0.bounds.get(field).unwrap();
        let front_space = self.count_leading_whitespace(bound.clone());
        // Only scan from the back when the field is not entirely
        // whitespace; otherwise the same bytes would be counted twice.
        let back_space =
            if front_space < bound.end - bound.start {
                self.count_leading_whitespace(bound.clone().rev())
            } else {
                0
            };

        // Remove the trailing run first so the front indices stay valid.
        // NOTE(review): each `drain` shifts all remaining bytes of the
        // record left, so this is worst-case quadratic in record size.
        self.0.fields.drain(bound.end - back_space..bound.end);
        self.0.fields.drain(bound.start..bound.start + front_space);
        // Pull this field's end in by the bytes just removed, and carry
        // the total forward for the fields that follow.
        self.0.bounds.ends[field] -= front_space + back_space;
        trimmed += front_space + back_space;
    }
}

/// Returns the amount of leading whitespace starting in the given range.
/// Whitespace is not counted past the end of the range.
fn count_leading_whitespace<R>(&self, range: R) -> usize
    where R: Iterator<Item=usize>
{
    // Consume indices in order while the byte at each index is ASCII
    // whitespace; the number of indices consumed is the count.
    range
        .take_while(|&i| match self.0.fields[i] {
            b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => true,
            _ => false,
        })
        .count()
}

/// Add a new field to this record.
///
/// # Example
Expand Down Expand Up @@ -842,6 +895,71 @@ mod tests {
assert_eq!(rec.get(1), None);
}

#[test]
fn trim_whitespace_only() {
    // Cover every byte class that `trim` removes: tab, newline, vertical
    // tab, form feed, carriage return, and space. A field that is all
    // whitespace must collapse to the empty field.
    let mut rec = ByteRecord::from(vec![b(" \t\n\x0b\x0c\r ")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("")));
}

#[test]
fn trim_front() {
    // Leading whitespace only, single field.
    let mut rec = ByteRecord::from(vec![b(" abc")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));

    // Multiple fields: trimming one field must not corrupt the bounds of
    // the next, since all fields share one contiguous buffer.
    let mut rec = ByteRecord::from(vec![b(" abc"), b(" xyz")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));
    assert_eq!(rec.get(1), Some(b("xyz")));
}

#[test]
fn trim_back() {
    // Trailing whitespace only, single field.
    let mut rec = ByteRecord::from(vec![b("abc ")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));

    // Multiple fields with trailing whitespace.
    let mut rec = ByteRecord::from(vec![b("abc "), b("xyz ")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));
    assert_eq!(rec.get(1), Some(b("xyz")));
}

#[test]
fn trim_both() {
    // Whitespace on both ends, single field.
    let mut rec = ByteRecord::from(vec![b(" abc ")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));

    // Multiple fields with whitespace on both ends.
    let mut rec = ByteRecord::from(vec![b(" abc "), b(" xyz ")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("abc")));
    assert_eq!(rec.get(1), Some(b("xyz")));
}

#[test]
fn trim_does_not_panic_on_empty_records_1() {
    // A record with a single empty field has zero-width bounds; trimming
    // must leave it intact rather than panic on the empty range.
    let mut rec = ByteRecord::from(vec![b("")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("")));
}

#[test]
fn trim_does_not_panic_on_empty_records_2() {
    // Multiple adjacent empty fields: consecutive zero-width bounds must
    // survive trimming unchanged.
    let mut rec = ByteRecord::from(vec![b(""), b("")]);
    rec.trim();
    assert_eq!(rec.get(0), Some(b("")));
    assert_eq!(rec.get(1), Some(b("")));
}

#[test]
fn trim_does_not_panic_on_empty_records_3() {
    // A record with no fields at all: trim must be a no-op.
    let mut rec = ByteRecord::new();
    rec.trim();
    assert!(rec.as_slice().is_empty());
}

#[test]
fn empty_field_1() {
let mut rec = ByteRecord::new();
Expand Down
36 changes: 36 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,42 @@ impl Default for Terminator {
}
}

/// The whitespace preservation behaviour when reading CSV data.
//
// All variants are unit-like, so equality is total: derive `Eq` alongside
// `PartialEq` (idiomatic, and lets `Trim` be used where `Eq` is required).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Trim {
    /// Preserves fields and headers. This is the default.
    None,
    /// Trim whitespace from headers.
    Headers,
    /// Trim whitespace from fields, but not headers.
    Fields,
    /// Trim whitespace from fields and headers.
    All,
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}

impl Trim {
    /// Whether record fields should have whitespace trimmed.
    fn should_trim_fields(&self) -> bool {
        match *self {
            Trim::Fields | Trim::All => true,
            _ => false,
        }
    }

    /// Whether header fields should have whitespace trimmed.
    fn should_trim_headers(&self) -> bool {
        match *self {
            Trim::Headers | Trim::All => true,
            _ => false,
        }
    }
}

impl Default for Trim {
    /// By default, no whitespace is trimmed from fields or headers.
    fn default() -> Trim {
        Trim::None
    }
}

/// A custom Serde deserializer for possibly invalid `Option<T>` fields.
///
/// When deserializing CSV data, it is sometimes desirable to simply ignore
Expand Down

0 comments on commit d08388b

Please sign in to comment.