Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
Prev Previous commit
Next Next commit
view to dict
  • Loading branch information
XiangpengHao committed Jun 11, 2024
commit bc626fe3f20213a3023534af3ec580fd1772a70e
80 changes: 76 additions & 4 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,30 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
Decimal256(_, _) => {
pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
}
Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
Utf8 => {
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
}
LargeUtf8 => {
if array.data_type() == &DataType::Utf8View {
return string_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
}
Binary => {
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i32>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
}
LargeBinary => {
if array.data_type() == &DataType::BinaryView {
return binary_view_to_dictionary::<K, i64>(array);
}
pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type:?}"
))),
Expand Down Expand Up @@ -254,6 +274,58 @@ where
Ok(Arc::new(b.finish()))
}

pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
for v in string_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

return Ok(Arc::new(b.finish()));
}

pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
array.len(),
1024,
1024,
);
let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
for v in binary_view.iter() {
match v {
Some(v) => {
b.append(v)?;
}
None => {
b.append_null();
}
}
}

return Ok(Arc::new(b.finish()));
}

// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
// key types of K
pub(crate) fn pack_byte_to_dictionary<K, T>(
Expand Down
20 changes: 20 additions & 0 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5273,6 +5273,26 @@ mod tests {
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}

#[test]
fn test_view_to_dict() {
    // Part 1: a string view array cast to a dictionary must equal the
    // dictionary collected directly from the same test data.
    let utf8_views = StringViewArray::from_iter(VIEW_TEST_DATA);
    let expected_utf8_dict: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
    let utf8_dict_type = expected_utf8_dict.data_type();
    let actual_utf8_dict = cast(&utf8_views, utf8_dict_type).unwrap();
    assert_eq!(actual_utf8_dict.data_type(), utf8_dict_type);
    assert_eq!(actual_utf8_dict.as_ref(), &expected_utf8_dict);

    // Part 2: build the expected binary dictionary by reusing the keys of the
    // string dictionary and casting its values to `Binary`, then compare it
    // with the result of casting a binary view array.
    let binary_views = BinaryViewArray::from_iter(VIEW_TEST_DATA);
    let typed_utf8_dict = expected_utf8_dict.downcast_dict::<StringArray>().unwrap();
    let binary_values = cast(&typed_utf8_dict.values(), &DataType::Binary).unwrap();
    let expected_binary_dict =
        DictionaryArray::<Int8Type>::new(typed_utf8_dict.keys().clone(), binary_values);
    let binary_dict_type = expected_binary_dict.data_type();
    let actual_binary_dict = cast(&binary_views, binary_dict_type).unwrap();
    assert_eq!(actual_binary_dict.data_type(), binary_dict_type);
    assert_eq!(actual_binary_dict.as_ref(), &expected_binary_dict);
}

#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
Expand Down