-
Notifications
You must be signed in to change notification settings - Fork 213
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This crate has been completely rewritten.
The old parser was kludgy, relied on CSV data being encoded as UTF-8 correctly and most importantly, slow. The parser has been rewritten to operate on bytes using a real state machine. I've also taken this opportunity to vastly simplify (and isolate) the type based encoding/decoding machinery. There are too many breaking changes to enumerate them. Usage of this crate is pretty simple, so I recommend checking out the updated API docs and re-working your code from that: http://burntsushi.net/rustdoc/csv/index.html [breaking-change]
- Loading branch information
1 parent
70a278d
commit 4b79594
Showing
19 changed files
with
1,870 additions
and
49,337 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,4 @@ examples/data/ss10pusa.csv | |
build | ||
target | ||
Cargo.lock | ||
scratch* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,10 @@ | ||
[package] | ||
name = "rust-csv" | ||
version = "0.2.4" | ||
version = "0.3.0" | ||
authors = ["Andrew Gallant <[email protected]>"] | ||
|
||
[lib] | ||
name = "csv" | ||
path = "src/lib.rs" | ||
crate_type = ["dylib", "rlib"] | ||
|
||
[dependencies.quickcheck] | ||
git = "git://github.com/BurntSushi/quickcheck" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,30 @@ | ||
extern crate csv; | ||
|
||
use std::comm::channel; | ||
use std::io::{ChanReader, ChanWriter, Reader, Writer}; | ||
use std::io::timer::sleep; | ||
use std::io; | ||
use std::task::spawn; | ||
use std::time::Duration; | ||
|
||
use csv::{Decoder, Encoder}; | ||
|
||
fn main() { | ||
let (send, recv) = channel(); | ||
spawn(proc() { | ||
let mut w = ChanWriter::new(send); | ||
let mut enc = Encoder::to_writer(&mut w as &mut Writer); | ||
let w = io::ChanWriter::new(send); | ||
let mut enc = csv::Writer::from_writer(w); | ||
for x in range(1u, 6) { | ||
match enc.encode((x, x * x)) { | ||
Ok(_) => {}, | ||
Err(err) => fail!("Failed encoding: {}", err), | ||
} | ||
sleep(Duration::milliseconds(500)); | ||
io::timer::sleep(Duration::milliseconds(500)); | ||
} | ||
}); | ||
|
||
let mut r = ChanReader::new(recv); | ||
let r = io::ChanReader::new(recv); | ||
// We create a CSV reader with a small buffer so that we can see streaming | ||
// in action on small inputs. | ||
let mut dec = Decoder::from_reader_capacity(&mut r as &mut Reader, 1); | ||
for r in dec.iter() { | ||
let buf = io::BufferedReader::with_capacity(1, r); | ||
let mut dec = csv::Reader::from_buffer(buf); | ||
for r in dec.records() { | ||
println!("Record: {}", r); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use std::fmt; | ||
use std::hash; | ||
|
||
/// A type that represents unadulterated byte strings. | ||
/// | ||
/// Byte strings represent *any* 8 bit character encoding. There are no | ||
/// restrictions placed on the type of encoding used. (This means that there | ||
/// may be *multiple* encodings in any particular byte string!) | ||
/// | ||
/// Many CSV files in the wild aren't just malformed with respect to RFC 4180, | ||
/// but they are commonly *not* UTF-8 encoded. Even worse, some of them are | ||
/// encoded improperly. Therefore, any useful CSV parser must be flexible with | ||
/// respect to encodings. | ||
/// | ||
/// Thus, this CSV parser uses byte strings internally. This means that | ||
/// quotes and field and record separators *must* be ASCII. Otherwise, | ||
/// the parser places no other restrictions on the content of data in each | ||
/// cell. | ||
/// | ||
/// Note that most of the methods in the encoder/decoder will assume UTF-8 | ||
/// encoding, but they also expose some lower level methods that use byte | ||
/// strings when absolutely necessary. This type is exposed in case you need | ||
/// to deal with the raw bytes directly. | ||
#[deriving(Clone, PartialEq, Eq, PartialOrd, Ord)] | ||
pub struct ByteString(Vec<u8>); | ||
|
||
impl ByteString { | ||
/// Create a new byte string from a vector or slice of bytes. | ||
pub fn from_bytes<S: CloneableVector<u8>>(bs: S) -> ByteString { | ||
ByteString(bs.into_vec()) | ||
} | ||
|
||
/// Consumes this byte string into a vector of bytes. | ||
pub fn into_bytes(self) -> Vec<u8> { | ||
let ByteString(chars) = self; | ||
chars | ||
} | ||
|
||
/// Returns this byte string as a slice of bytes. | ||
pub fn as_bytes<'a>(&'a self) -> &'a [u8] { | ||
let &ByteString(ref chars) = self; | ||
chars.as_slice() | ||
} | ||
|
||
/// Consumes the byte string and decodes it into a Unicode string. If the | ||
/// decoding fails, then the original ByteString is returned. | ||
pub fn as_utf8_string(self) -> Result<String, ByteString> { | ||
String::from_utf8(self.into_bytes()).map_err(ByteString) | ||
} | ||
} | ||
|
||
impl fmt::Show for ByteString { | ||
/// Writes the underlying bytes as a `&[u8]`. | ||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
let ByteString(ref chars) = *self; | ||
// XXX: Ideally, we could just do this: | ||
// | ||
// f.write(chars.as_slice()) | ||
// | ||
// and let the output device figure out how to render it. But it seems | ||
// the formatting infrastructure assumes that the data is UTF-8 | ||
// encodable, which obviously doesn't work with raw byte strings. | ||
// | ||
// For now, we just show the bytes, e.g., `[255, 50, 48, 49, ...]`. | ||
write!(f, "{}", chars.as_slice()) | ||
} | ||
} | ||
|
||
impl Slice<u8> for ByteString { | ||
fn as_slice<'a>(&'a self) -> &'a [u8] { | ||
let ByteString(ref chars) = *self; | ||
chars.as_slice() | ||
} | ||
} | ||
|
||
impl<H: hash::Writer> hash::Hash<H> for ByteString { | ||
fn hash(&self, hasher: &mut H) { | ||
self.as_slice().hash(hasher); | ||
} | ||
} | ||
|
||
impl<S: Str> Equiv<S> for ByteString { | ||
fn equiv(&self, other: &S) -> bool { | ||
self.as_bytes() == other.as_slice().as_bytes() | ||
} | ||
} | ||
|
||
impl Collection for ByteString { | ||
fn len(&self) -> uint { self.as_bytes().len() } | ||
} |
Oops, something went wrong.