Skip to content

Commit

Permalink
Improve performance of casting StringView/BinaryView to `DictionaryArray` (#5872)
Browse files Browse the repository at this point in the history

* zero-copy dict to view

* view to dict

* refactor to use try_append_view

* unchecked view

* make fmt happy

* update test

* add comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb committed Jun 13, 2024
1 parent c6359bf commit 8752e01
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 16 deletions.
84 changes: 80 additions & 4 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
Decimal256(_, _) => {
pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
}
Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
Utf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
}
LargeUtf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
}
Binary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
}
LargeBinary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type:?}"
))),
Expand Down Expand Up @@ -226,6 +250,58 @@ where
Ok(Arc::new(b.finish()))
}

pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
for v in string_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
for v in binary_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
// key types of K
pub(crate) fn pack_byte_to_dictionary<K, T>(
Expand Down
36 changes: 24 additions & 12 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5205,10 +5205,10 @@ mod tests {

const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("world"),
Some("repeated"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
Some("repeated"),
];

fn _test_string_to_view<O>()
Expand Down Expand Up @@ -5291,6 +5291,26 @@ mod tests {
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}

#[test]
fn test_view_to_dict() {
    // Utf8View -> Dictionary<Int8, Utf8>: the expected dictionary is collected
    // directly from the shared test data.
    let expected_string_dict: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
    let string_dict_type = expected_string_dict.data_type();
    let string_view_input = StringViewArray::from_iter(VIEW_TEST_DATA);
    let actual_string_dict = cast(&string_view_input, string_dict_type).unwrap();
    assert_eq!(actual_string_dict.data_type(), string_dict_type);
    assert_eq!(actual_string_dict.as_ref(), &expected_string_dict);

    // BinaryView -> Dictionary<Int8, Binary>: the expected dictionary reuses the
    // string dictionary's keys and casts its values to Binary.
    let typed_string_dict = expected_string_dict
        .downcast_dict::<StringArray>()
        .unwrap();
    let binary_values = cast(&typed_string_dict.values(), &DataType::Binary).unwrap();
    let expected_binary_dict =
        DictionaryArray::<Int8Type>::new(typed_string_dict.keys().clone(), binary_values);
    let binary_dict_type = expected_binary_dict.data_type();
    let binary_view_input = BinaryViewArray::from_iter(VIEW_TEST_DATA);
    let actual_binary_dict = cast(&binary_view_input, binary_dict_type).unwrap();
    assert_eq!(actual_binary_dict.data_type(), binary_dict_type);
    assert_eq!(actual_binary_dict.as_ref(), &expected_binary_dict);
}

#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
Expand Down Expand Up @@ -5330,23 +5350,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&[u8]>> = vec![
Some(b"hello"),
Some(b"world"),
None,
Some(b"large payload over 12 bytes"),
Some(b"lulu"),
];

let view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};

let expected_binary_array = GenericBinaryArray::<O>::from(data);
let expected_binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_binary_array.data_type();

assert!(can_cast_types(view_array.data_type(), expected_type));
Expand Down

0 comments on commit 8752e01

Please sign in to comment.