This crate has been completely rewritten.
The old parser was kludgy: it relied on CSV data being correctly encoded
as UTF-8 and, most importantly, it was slow. The parser has been rewritten
to operate on raw bytes using a real state machine. I've also taken this
opportunity to vastly simplify (and isolate) the type-based
encoding/decoding machinery.

There are too many breaking changes to enumerate. Usage of this
crate is pretty simple, so I recommend checking out the updated API docs
and reworking your code from there:

http://burntsushi.net/rustdoc/csv/index.html

[breaking-change]
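The commit message mentions a byte-oriented state machine but doesn't show it. A minimal sketch of that idea in modern Rust syntax (this is a hypothetical illustration of the approach, not the parser actually added in this commit; `parse_record` and the state names are invented for the example):

```rust
enum State {
    FieldStart,    // at the start of a field
    InField,       // inside an unquoted field
    InQuoted,      // inside a quoted field
    QuoteInQuoted, // saw '"' inside a quoted field: escape or field end
}

/// Split one CSV record (one line, no trailing newline) into raw byte
/// fields. The delimiter and quote must be ASCII, but field contents
/// may be arbitrary bytes in any encoding -- no UTF-8 requirement.
fn parse_record(line: &[u8], delim: u8) -> Vec<Vec<u8>> {
    use State::*;
    let mut state = FieldStart;
    let mut field = Vec::new();
    let mut fields = Vec::new();
    for &b in line {
        state = match (state, b) {
            (FieldStart, b'"') => InQuoted,
            (FieldStart | InField, b) if b == delim => {
                fields.push(std::mem::take(&mut field));
                FieldStart
            }
            (FieldStart | InField, b) => { field.push(b); InField }
            (InQuoted, b'"') => QuoteInQuoted,
            (InQuoted, b) => { field.push(b); InQuoted }
            // "" inside a quoted field is an escaped quote.
            (QuoteInQuoted, b'"') => { field.push(b'"'); InQuoted }
            (QuoteInQuoted, b) if b == delim => {
                fields.push(std::mem::take(&mut field));
                FieldStart
            }
            // Malformed input: prefer *a* parse over a correct one.
            (QuoteInQuoted, b) => { field.push(b); InField }
        };
    }
    fields.push(field);
    fields
}

fn main() {
    for f in parse_record(b"a,\"b,\"\"c\"\"\",d", b',') {
        println!("{:?}", String::from_utf8_lossy(&f));
    }
}
```

Because every transition looks at a single byte, the machine never needs to decode UTF-8 and degrades gracefully on malformed input, which is exactly the flexibility the rewrite is after.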
BurntSushi committed Sep 22, 2014
1 parent 70a278d commit 4b79594
Showing 19 changed files with 1,870 additions and 49,337 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,3 +5,4 @@ examples/data/ss10pusa.csv
build
target
Cargo.lock
scratch*
5 changes: 1 addition & 4 deletions Cargo.toml
@@ -1,13 +1,10 @@
[package]
name = "rust-csv"
version = "0.2.4"
version = "0.3.0"
authors = ["Andrew Gallant <[email protected]>"]

[lib]
name = "csv"
path = "src/lib.rs"
crate_type = ["dylib", "rlib"]

[dependencies.quickcheck]
git = "git://github.com/BurntSushi/quickcheck"

50 changes: 26 additions & 24 deletions README.md
@@ -1,13 +1,24 @@
This crate provides a streaming CSV (comma separated values) encoder and
decoder that works with the `Encoder` and `Decoder` traits in Rust's
`serialize` crate. It [conforms closely to RFC
4180](http://burntsushi.net/rustdoc/csv/#compliance-with-rfc-4180).
This crate provides a streaming CSV (comma separated values) writer and
reader that works with the `serialize` crate to do type based encoding
and decoding. There are two primary goals of this project:

1. The default mode of parsing should *just work*. This means the parser
will bias toward providing *a* parse over a *correct* parse (with
respect to [RFC 4180](http://tools.ietf.org/html/rfc4180)).
2. Convenient to use by default, but when performance is needed, the
API will provide an escape hatch.

[![Build status](https://api.travis-ci.org/BurntSushi/rust-csv.png)](https://travis-ci.org/BurntSushi/rust-csv)

Licensed under the [UNLICENSE](http://unlicense.org).


### Documentation

The API is fully documented with lots of examples:
[http://burntsushi.net/rustdoc/csv/](http://burntsushi.net/rustdoc/csv/).


### Simple examples

Here is a full working Rust program that decodes records from a CSV file. Each
@@ -21,10 +32,10 @@ use std::path::Path;

fn main() {
let fp = &Path::new("./data/simple.csv");
let mut rdr = csv::Decoder::from_file(fp);
let mut rdr = csv::Reader::from_file(fp);

for record in rdr.iter_decode::<(String, String, uint)>() {
let (s1, s2, dist) = record.unwrap();
for record in rdr.decode() {
let (s1, s2, dist): (String, String, uint) = record.unwrap();
println!("({}, {}): {}", s1, s2, dist);
}
}
@@ -47,10 +58,10 @@ struct Record {

fn main() {
let fp = &Path::new("./data/simple.csv");
let mut rdr = csv::Decoder::from_file(fp);
let mut rdr = csv::Reader::from_file(fp);

for record in rdr.iter_decode::<Record>() {
let record = record.unwrap();
for record in rdr.decode() {
let record: Record = record.unwrap();
println!("({}, {}): {}", record.s1, record.s2, record.dist);
}
}
@@ -67,15 +78,10 @@ struct Record {
}
```

You can also read CSV headers, change the separator, use `enum` types or just
You can also read CSV headers, change the delimiter, use `enum` types or just
get plain access to records as vectors of strings. There are examples with more
details in the documentation.

### Documentation

The API is fully documented with lots of examples:
[http://burntsushi.net/rustdoc/csv/](http://burntsushi.net/rustdoc/csv/).


### Installation

@@ -99,12 +105,8 @@ git = "git://github.com/BurntSushi/rust-csv"

### Related work

The only other CSV parser I know of that builds is
[Geal/rust-csv](https://github.com/Geal/rust-csv), but it doesn't support the
`Encoder` or `Decoder` API.

Another one popped up at
[arjantop/rust-tabular](https://github.com/arjantop/rust-tabular) just
recently, which also does not support the `Encoder` or `Decoder` API.
However, it does support parsing fixed-width tables.
The only other one I know of is
[arjantop/rust-tabular](https://github.com/arjantop/rust-tabular),
which does not support the `Encoder` or `Decoder` API. However, it does support
parsing fixed-width tables.

File renamed without changes.
44,854 changes: 0 additions & 44,854 deletions examples/data/large.csv

This file was deleted.

3,000 changes: 0 additions & 3,000 deletions examples/data/short.csv

This file was deleted.

5 changes: 2 additions & 3 deletions examples/nfl_plays.rs
@@ -2,7 +2,6 @@ extern crate csv;
extern crate serialize;

use std::path::Path;
use csv::Decoder;

#[allow(dead_code)]
#[deriving(Decodable)]
@@ -25,8 +24,8 @@ struct Play {
fn main() {
let fp = &Path::new("./data/2012_nfl_pbp_data.csv");

let mut dec = Decoder::from_file(fp);
match dec.decode_all::<Play>() {
let mut dec = csv::Reader::from_file(fp);
match csv::collect(dec.decode::<Play>()) {
Err(err) => fail!("{}", err),
Ok(plays) => {
println!("Found {} plays.", plays.len());
6 changes: 3 additions & 3 deletions examples/simple.rs
@@ -4,10 +4,10 @@ use std::path::Path;

fn main() {
let fp = &Path::new("./data/simple.csv");
let mut rdr = csv::Decoder::from_file(fp);
let mut rdr = csv::Reader::from_file(fp);

for record in rdr.iter_decode::<(String, String, uint)>() {
let (s1, s2, dist) = record.unwrap();
for record in rdr.decode() {
let (s1, s2, dist): (String, String, uint) = record.unwrap();
println!("({}, {}): {}", s1, s2, dist);
}
}
6 changes: 3 additions & 3 deletions examples/simple_missing.rs
@@ -12,10 +12,10 @@ struct Record {

fn main() {
let fp = &Path::new("./data/simple_missing.csv");
let mut rdr = csv::Decoder::from_file(fp);
let mut rdr = csv::Reader::from_file(fp);

for record in rdr.iter_decode::<Record>() {
let record = record.unwrap();
for record in rdr.decode() {
let record: Record = record.unwrap();
println!("({}, {}): {}", record.s1, record.s2, record.dist);
}
}
6 changes: 3 additions & 3 deletions examples/simple_struct.rs
@@ -12,10 +12,10 @@ struct Record {

fn main() {
let fp = &Path::new("./data/simple.csv");
let mut rdr = csv::Decoder::from_file(fp);
let mut rdr = csv::Reader::from_file(fp);

for record in rdr.iter_decode::<Record>() {
let record = record.unwrap();
for record in rdr.decode() {
let record: Record = record.unwrap();
println!("({}, {}): {}", record.s1, record.s2, record.dist);
}
}
18 changes: 8 additions & 10 deletions examples/stream.rs
@@ -1,32 +1,30 @@
extern crate csv;

use std::comm::channel;
use std::io::{ChanReader, ChanWriter, Reader, Writer};
use std::io::timer::sleep;
use std::io;
use std::task::spawn;
use std::time::Duration;

use csv::{Decoder, Encoder};

fn main() {
let (send, recv) = channel();
spawn(proc() {
let mut w = ChanWriter::new(send);
let mut enc = Encoder::to_writer(&mut w as &mut Writer);
let w = io::ChanWriter::new(send);
let mut enc = csv::Writer::from_writer(w);
for x in range(1u, 6) {
match enc.encode((x, x * x)) {
Ok(_) => {},
Err(err) => fail!("Failed encoding: {}", err),
}
sleep(Duration::milliseconds(500));
io::timer::sleep(Duration::milliseconds(500));
}
});

let mut r = ChanReader::new(recv);
let r = io::ChanReader::new(recv);
// We create a CSV reader with a small buffer so that we can see streaming
// in action on small inputs.
let mut dec = Decoder::from_reader_capacity(&mut r as &mut Reader, 1);
for r in dec.iter() {
let buf = io::BufferedReader::with_capacity(1, r);
let mut dec = csv::Reader::from_buffer(buf);
for r in dec.records() {
println!("Record: {}", r);
}
}
56 changes: 29 additions & 27 deletions src/bench.rs
@@ -1,11 +1,12 @@
use std::fmt::Show;
use std::io;
use std::io::Reader as IoReader;
use std::io::Writer as IoWriter;
use stdtest::Bencher;
use super::Decoder;

static CSV_SHORT: &'static str = "./examples/data/short.csv";
static CSV_MEDIUM: &'static str = "./examples/data/medium.csv";
static CSV_LARGE: &'static str = "./examples/data/large.csv";
use Reader;

static CSV_DATA: &'static str = "./examples/data/bench.csv";

fn ordie<T, E: Show>(r: Result<T, E>) -> T {
r.or_else(|e: E| -> Result<T, E> fail!(e.to_string())).unwrap()
@@ -19,33 +20,38 @@ fn file_to_mem(fp: &str) -> io::MemReader {
io::MemReader::new(bs)
}

fn reader<'a>(rdr: &'a mut io::MemReader)
-> Reader<io::RefReader<'a, io::MemReader>> {
let _ = ordie(rdr.seek(0, io::SeekSet));
Reader::from_reader(rdr.by_ref())
}

#[bench]
fn short_raw_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_SHORT);
fn raw_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_DATA);
b.iter(|| {
let _ = ordie(data.seek(0, io::SeekSet));
let mut dec = Decoder::from_reader(&mut data as &mut io::Reader);
for _ in dec.iter() {}
let mut dec = reader(&mut data);
while !dec.done() {
for r in dec { let _ = r.unwrap(); }
}
})
}

#[bench]
fn medium_raw_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_MEDIUM);
fn byte_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_DATA);
b.iter(|| {
let _ = ordie(data.seek(0, io::SeekSet));
let mut dec = Decoder::from_reader(&mut data as &mut io::Reader);
for _ in dec.iter() {}
let mut dec = reader(&mut data);
for r in dec.byte_records() { let _ = r.unwrap(); }
})
}

#[bench]
fn large_raw_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_LARGE);
fn string_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_DATA);
b.iter(|| {
let _ = ordie(data.seek(0, io::SeekSet));
let mut dec = Decoder::from_reader(&mut data as &mut io::Reader);
for _ in dec.iter() {}
let mut dec = reader(&mut data);
for r in dec.records() { let _ = r.unwrap(); }
})
}

@@ -68,14 +74,10 @@ struct Play {
}

#[bench]
fn short_decoded_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_SHORT);
fn decoded_records(b: &mut Bencher) {
let mut data = file_to_mem(CSV_DATA);
b.iter(|| {
let _ = ordie(data.seek(0, io::SeekSet));
let mut dec = Decoder::from_reader(&mut data as &mut io::Reader);
match dec.decode_all::<Play>() {
Ok(_) => {}
Err(err) => fail!("{}", err),
}
let mut dec = reader(&mut data);
for r in dec.decode::<Play>() { let _ = r.unwrap(); }
})
}
90 changes: 90 additions & 0 deletions src/bytestr.rs
@@ -0,0 +1,90 @@
use std::fmt;
use std::hash;

/// A type that represents unadulterated byte strings.
///
/// Byte strings represent *any* 8 bit character encoding. There are no
/// restrictions placed on the type of encoding used. (This means that there
/// may be *multiple* encodings in any particular byte string!)
///
/// Many CSV files in the wild aren't just malformed with respect to RFC 4180,
/// but they are commonly *not* UTF-8 encoded. Even worse, some of them are
/// encoded improperly. Therefore, any useful CSV parser must be flexible with
/// respect to encodings.
///
/// Thus, this CSV parser uses byte strings internally. This means that
/// quotes and field and record separators *must* be ASCII. Otherwise,
/// the parser places no other restrictions on the content of data in each
/// cell.
///
/// Note that most of the methods in the encoder/decoder will assume UTF-8
/// encoding, but they also expose some lower level methods that use byte
/// strings when absolutely necessary. This type is exposed in case you need
/// to deal with the raw bytes directly.
#[deriving(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ByteString(Vec<u8>);

impl ByteString {
/// Create a new byte string from a vector or slice of bytes.
pub fn from_bytes<S: CloneableVector<u8>>(bs: S) -> ByteString {
ByteString(bs.into_vec())
}

/// Consumes this byte string into a vector of bytes.
pub fn into_bytes(self) -> Vec<u8> {
let ByteString(chars) = self;
chars
}

/// Returns this byte string as a slice of bytes.
pub fn as_bytes<'a>(&'a self) -> &'a [u8] {
let &ByteString(ref chars) = self;
chars.as_slice()
}

/// Consumes the byte string and decodes it into a Unicode string. If the
/// decoding fails, then the original ByteString is returned.
pub fn as_utf8_string(self) -> Result<String, ByteString> {
String::from_utf8(self.into_bytes()).map_err(ByteString)
}
}

impl fmt::Show for ByteString {
/// Writes the underlying bytes as a `&[u8]`.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let ByteString(ref chars) = *self;
// XXX: Ideally, we could just do this:
//
// f.write(chars.as_slice())
//
// and let the output device figure out how to render it. But it seems
// the formatting infrastructure assumes that the data is UTF-8
// encodable, which obviously doesn't work with raw byte strings.
//
// For now, we just show the bytes, e.g., `[255, 50, 48, 49, ...]`.
write!(f, "{}", chars.as_slice())
}
}

impl Slice<u8> for ByteString {
fn as_slice<'a>(&'a self) -> &'a [u8] {
let ByteString(ref chars) = *self;
chars.as_slice()
}
}

impl<H: hash::Writer> hash::Hash<H> for ByteString {
fn hash(&self, hasher: &mut H) {
self.as_slice().hash(hasher);
}
}

impl<S: Str> Equiv<S> for ByteString {
fn equiv(&self, other: &S) -> bool {
self.as_bytes() == other.as_slice().as_bytes()
}
}

impl Collection for ByteString {
fn len(&self) -> uint { self.as_bytes().len() }
}
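The `ByteString` design above is easiest to see in use. Here is a hypothetical rendering of the same round trip in modern Rust syntax: the identifiers mirror the commit, but the trait bounds are swapped for today's equivalents (`Into<Vec<u8>>` instead of `CloneableVector`, `FromUtf8Error::into_bytes` to recover the bytes on failure), so treat it as an illustration rather than the committed source:

```rust
// Illustrative modern-Rust version of the ByteString round trip above.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ByteString(Vec<u8>);

impl ByteString {
    /// Create a byte string from anything convertible to a Vec<u8>.
    pub fn from_bytes<S: Into<Vec<u8>>>(bs: S) -> ByteString {
        ByteString(bs.into())
    }

    /// Consume the byte string into its raw bytes.
    pub fn into_bytes(self) -> Vec<u8> {
        self.0
    }

    /// View the raw bytes.
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Decode into a Unicode string, handing the original bytes back
    /// unchanged if they are not valid UTF-8.
    pub fn as_utf8_string(self) -> Result<String, ByteString> {
        String::from_utf8(self.0).map_err(|e| ByteString(e.into_bytes()))
    }
}

fn main() {
    // Valid UTF-8 decodes ("na\xc3\xafve" is "naïve" in UTF-8)...
    let ok = ByteString::from_bytes(&b"na\xc3\xafve"[..]);
    assert_eq!(ok.as_utf8_string(), Ok("naïve".to_string()));

    // ...while non-UTF-8 bytes (here, Latin-1 "caf\xe9") survive intact:
    // a failed decode loses nothing.
    let raw = ByteString::from_bytes(&b"caf\xe9"[..]);
    let raw = raw.as_utf8_string().unwrap_err();
    assert_eq!(raw.as_bytes(), b"caf\xe9");
}
```

The key property is the `Result<String, ByteString>` return: decoding is opt-in and lossless, so the parser itself never has to care what encoding the cells are in.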
