Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
Previous commit
Next commit
refactor to use try_append_view
  • Loading branch information
XiangpengHao committed Jun 11, 2024
commit 0308dd48b12989fd0d610be1d1f9e90647688c6a
72 changes: 20 additions & 52 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,22 +93,10 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
.downcast_dict::<StringArray>()
.unwrap();

let string_values = dict_array.values();
let value_offsets = string_values.value_offsets();
let value_buffer = string_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());

// Safety:
// the buffer is from StringArray which is utf8.
let string_view = unsafe {
StringViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(string_view))
}
BinaryView => {
Expand All @@ -119,61 +107,41 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
.downcast_dict::<BinaryArray>()
.unwrap();

let binary_values = dict_array.values();
let value_offsets = binary_values.value_offsets();
let value_buffer = binary_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());
let binary_view = unsafe {
BinaryViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}

fn view_from_dict_values<K: ArrowDictionaryKeyType>(
value_offsets: &[i32],
value_buffer: &arrow_buffer::Buffer,
fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
array: &GenericByteArray<V>,
keys: &PrimitiveArray<K>,
) -> ScalarBuffer<u128> {
let mut view_builder = BufferBuilder::<u128>::new(keys.len());
) -> GenericByteViewArray<T> {
let value_buffer = array.values();
let value_offsets = array.value_offsets();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
builder.append_block(value_buffer.clone());
for i in keys.iter() {
match i {
Some(v) => {
let idx = v.to_usize().unwrap();
let offset = value_offsets[idx];
let end = value_offsets[idx + 1];
let length = end - offset;
let value_buf = &value_buffer[offset as usize..end as usize];

if length <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
view_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: length as u32,
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
buffer_index: 0,
offset: offset as u32,
};
view_builder.append(view.into());
}
builder
.try_append_view(0, offset.as_usize() as u32, length.as_usize() as u32)
.unwrap();
}
None => {
view_builder.append_n_zeroed(1);
builder.append_null();
}
}
}
ScalarBuffer::new(view_builder.finish(), 0, keys.len())
builder.finish()
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
Expand Down
30 changes: 24 additions & 6 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5255,18 +5255,36 @@ mod tests {

#[test]
fn test_dict_to_view() {
let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
let string_dict_array: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
let values = StringArray::from_iter(VIEW_TEST_DATA);
let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
let string_dict_array =
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
let typed_dict = string_dict_array.downcast_dict::<StringArray>().unwrap();

let string_view_array = {
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_string_array_type = string_view_array.data_type();
let casted_string_array = cast(&string_dict_array, expected_string_array_type).unwrap();
assert_eq!(casted_string_array.data_type(), expected_string_array_type);
assert_eq!(casted_string_array.as_ref(), &string_view_array);

let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
let binary_dict_array = string_dict_array.downcast_dict::<StringArray>().unwrap();
let binary_buffer = cast(&binary_dict_array.values(), &DataType::Binary).unwrap();
let binary_buffer = cast(&typed_dict.values(), &DataType::Binary).unwrap();
let binary_dict_array =
DictionaryArray::<Int8Type>::new(binary_dict_array.keys().clone(), binary_buffer);
DictionaryArray::<Int8Type>::new(typed_dict.keys().clone(), binary_buffer);
let typed_binary_dict = binary_dict_array.downcast_dict::<BinaryArray>().unwrap();

let binary_view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_binary_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_binary_array_type = binary_view_array.data_type();
let casted_binary_array = cast(&binary_dict_array, expected_binary_array_type).unwrap();
assert_eq!(casted_binary_array.data_type(), expected_binary_array_type);
Expand Down
Loading