serde: fix bug in handling of invalid UTF-8

This fixes a bug that was tripped if one tried to use Serde to deserialize records that are not valid UTF-8. In particular, given a struct like the following: #[derive(Debug, Deserialize)] struct Row { h1: String, #[serde(with = "serde_bytes")] h2: Vec<u8>, h3: String, } then using ByteRecord::deserialize would still return an invalid UTF-8 error if invalid UTF-8 bytes were found in the h2 field, even though the field explicitly supports arbitrary bytes. The root cause of this bug is that the deserializer was attempting to run raw byte buffers through UTF-8 validation. Instead, it should simply use the raw bytes directly. We add a regression test that exhibits code that did not work but now does work. Thanks to @dcarosone, whose question led to this bug's discovery: https://users.rust-lang.org/t/csv-serde-vs-non-utf8-easily/23262
BurntSushi · Dec 15, 2018 · 9e644e6 · matklad · Dec 15, 2018 · BurntSushi
1 parent f6d46b1
commit 9e644e6
Showing 1 changed file with 32 additions and 2 deletions.
diff --git a/src/deserializer.rs b/src/deserializer.rs
@@ -413,8 +413,10 @@ impl<'a, 'de: 'a, T: DeRecord<'de>> Deserializer<'de>
  self,
  visitor: V,
  ) -> Result<V::Value, Self::Error> {
- self.next_field()
- .and_then(|f| visitor.visit_byte_buf(f.as_bytes().to_vec()))
+ self.next_field_bytes()
+ .and_then(|f| visitor.visit_byte_buf(f.to_vec()))
+ // self.next_field()
+ // .and_then(|f| visitor.visit_byte_buf(f.as_bytes().to_vec()))
  }
 
  fn deserialize_option<V: Visitor<'de>>(
@@ -775,6 +777,10 @@ mod tests {
  deserialize_string_record(&record, Some(&headers))
  }
 
+ fn b<'a, T: AsRef<[u8]> + ?Sized>(bytes: &'a T) -> &'a [u8] {
+ bytes.as_ref()
+ }
+
  #[test]
  fn with_header() {
  #[derive(Deserialize, Debug, PartialEq)]
@@ -1160,4 +1166,28 @@ mod tests {
  properties: Properties { prop1: 3.0, prop2: 4.0 },
  });
  }
+
+ #[test]
+ fn partially_invalid_utf8() {
+ #[derive(Debug, Deserialize, PartialEq)]
+ struct Row {
+ h1: String,
+ #[serde(with = "serde_bytes")]
+ h2: Vec<u8>,
+ h3: String,
+ }
+
+ let headers = ByteRecord::from(vec![b"h1", b"h2", b"h3"]);
+ let record = ByteRecord::from(vec![
+ b(b"baz"), b(b"foo\xFFbar"), b(b"quux"),
+ ]);
+ let got: Row = deserialize_byte_record(
+ &record, Some(&headers),
+ ).unwrap();
+ assert_eq!(got, Row {
+ h1: "baz".to_string(),
+ h2: b"foo\xFFbar".to_vec(),
+ h3: "quux".to_string(),
+ });
+ }
 }