Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
84 changes: 80 additions & 4 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
Decimal256(_, _) => {
pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
}
Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
Utf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
}
LargeUtf8 => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
}
Binary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
}
LargeBinary => {
// If the input is a view type, we can avoid casting (thus copying) the data
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type:?}"
))),
Expand Down Expand Up @@ -226,6 +250,58 @@ where
Ok(Arc::new(b.finish()))
}

pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
for v in string_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
for v in binary_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

Ok(Arc::new(b.finish()))
}

// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
// key types of K
pub(crate) fn pack_byte_to_dictionary<K, T>(
Expand Down
36 changes: 24 additions & 12 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5205,10 +5205,10 @@ mod tests {

const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("world"),
Some("repeated"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
Some("repeated"),
];

fn _test_string_to_view<O>()
Expand Down Expand Up @@ -5291,6 +5291,26 @@ mod tests {
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}

#[test]
fn test_view_to_dict() {
    // --- Utf8View -> Dictionary(Int8, Utf8) ---
    // The expected dictionary is collected from the same test data that
    // populates the view array, so the cast result must equal it exactly.
    let utf8_views = StringViewArray::from_iter(VIEW_TEST_DATA);
    let expected_utf8_dict: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
    let utf8_dict_type = expected_utf8_dict.data_type();
    let actual_utf8_dict = cast(&utf8_views, utf8_dict_type).unwrap();
    assert_eq!(actual_utf8_dict.data_type(), utf8_dict_type);
    assert_eq!(actual_utf8_dict.as_ref(), &expected_utf8_dict);

    // --- BinaryView -> Dictionary(Int8, Binary) ---
    // The expected binary dictionary reuses the string dictionary's keys and
    // pairs them with its values cast to `Binary`.
    let binary_views = BinaryViewArray::from_iter(VIEW_TEST_DATA);
    let typed_utf8_dict = expected_utf8_dict.downcast_dict::<StringArray>().unwrap();
    let binary_values = cast(&typed_utf8_dict.values(), &DataType::Binary).unwrap();
    let expected_binary_dict =
        DictionaryArray::<Int8Type>::new(typed_utf8_dict.keys().clone(), binary_values);
    let binary_dict_type = expected_binary_dict.data_type();
    let actual_binary_dict = cast(&binary_views, binary_dict_type).unwrap();
    assert_eq!(actual_binary_dict.data_type(), binary_dict_type);
    assert_eq!(actual_binary_dict.as_ref(), &expected_binary_dict);
}

#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
Expand Down Expand Up @@ -5330,23 +5350,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&[u8]>> = vec![
Some(b"hello"),
Some(b"world"),
None,
Some(b"large payload over 12 bytes"),
Some(b"lulu"),
];

let view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};

let expected_binary_array = GenericBinaryArray::<O>::from(data);
let expected_binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_binary_array.data_type();

assert!(can_cast_types(view_array.data_type(), expected_type));
Expand Down
Loading