-
Notifications
You must be signed in to change notification settings - Fork 1.1k
fix: cast Binary/String dictionary to view #8912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,111 +28,92 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>( | |
| ) -> Result<ArrayRef, ArrowError> { | ||
| use DataType::*; | ||
|
|
||
| match to_type { | ||
| Dictionary(to_index_type, to_value_type) => { | ||
| let dict_array = array | ||
| .as_any() | ||
| .downcast_ref::<DictionaryArray<K>>() | ||
| .ok_or_else(|| { | ||
| ArrowError::ComputeError( | ||
| "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), | ||
| ) | ||
| })?; | ||
| let array = array.as_dictionary::<K>(); | ||
| let from_child_type = array.values().data_type(); | ||
| match (from_child_type, to_type) { | ||
| (_, Dictionary(to_index_type, to_value_type)) => { | ||
| dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options) | ||
| } | ||
| // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data | ||
| // copy of the value buffer. Fast path which avoids copying underlying values buffer. | ||
| // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit) | ||
| // TODO: handle cross types (String -> BinaryView, Binary -> StringView) | ||
| // (need to validate utf8?) | ||
| (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>( | ||
| array.keys(), | ||
| array.values().as_string::<i32>(), | ||
| ), | ||
| (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>( | ||
| array.keys(), | ||
| array.values().as_binary::<i32>(), | ||
| ), | ||
|
Comment on lines
+39
to
+49
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the main change; we check the values beforehand to see if it is valid for the fast path, which was missing before (and the assumption lead to an error). I've intentionally kept the behaviour as intended (limited the fast path only for |
||
| _ => unpack_dictionary(array, to_type, cast_options), | ||
| } | ||
| } | ||
|
|
||
| let keys_array: ArrayRef = | ||
| Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data())); | ||
| let values_array = dict_array.values(); | ||
| let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; | ||
| let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; | ||
| fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>( | ||
| array: &DictionaryArray<K>, | ||
| to_index_type: &DataType, | ||
| to_value_type: &DataType, | ||
| cast_options: &CastOptions, | ||
| ) -> Result<ArrayRef, ArrowError> { | ||
| use DataType::*; | ||
|
|
||
| // Failure to cast keys (because they don't fit in the | ||
| // target type) results in NULL values; | ||
| if cast_keys.null_count() > keys_array.null_count() { | ||
| return Err(ArrowError::ComputeError(format!( | ||
| "Could not convert {} dictionary indexes from {:?} to {:?}", | ||
| cast_keys.null_count() - keys_array.null_count(), | ||
| keys_array.data_type(), | ||
| to_index_type | ||
| ))); | ||
| } | ||
| let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data())); | ||
| let values_array = array.values(); | ||
| let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; | ||
| let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; | ||
|
|
||
| let data = cast_keys.into_data(); | ||
| let builder = data | ||
| .into_builder() | ||
| .data_type(to_type.clone()) | ||
| .child_data(vec![cast_values.into_data()]); | ||
| // Failure to cast keys (because they don't fit in the | ||
| // target type) results in NULL values; | ||
| if cast_keys.null_count() > keys_array.null_count() { | ||
| return Err(ArrowError::ComputeError(format!( | ||
| "Could not convert {} dictionary indexes from {:?} to {:?}", | ||
| cast_keys.null_count() - keys_array.null_count(), | ||
| keys_array.data_type(), | ||
| to_index_type | ||
| ))); | ||
| } | ||
|
|
||
| // Safety | ||
| // Cast keys are still valid | ||
| let data = unsafe { builder.build_unchecked() }; | ||
| let data = cast_keys.into_data(); | ||
| let builder = data | ||
| .into_builder() | ||
| .data_type(Dictionary( | ||
| Box::new(to_index_type.clone()), | ||
| Box::new(to_value_type.clone()), | ||
| )) | ||
| .child_data(vec![cast_values.into_data()]); | ||
|
|
||
| // create the appropriate array type | ||
| let new_array: ArrayRef = match **to_index_type { | ||
| Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)), | ||
| Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)), | ||
| Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)), | ||
| Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)), | ||
| UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)), | ||
| UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)), | ||
| UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)), | ||
| UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)), | ||
| _ => { | ||
| return Err(ArrowError::CastError(format!( | ||
| "Unsupported type {to_index_type} for dictionary index" | ||
| ))); | ||
| } | ||
| }; | ||
| // Safety | ||
| // Cast keys are still valid | ||
| let data = unsafe { builder.build_unchecked() }; | ||
|
|
||
| Ok(new_array) | ||
| // create the appropriate array type | ||
| let new_array: ArrayRef = match to_index_type { | ||
| Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)), | ||
| Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)), | ||
| Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)), | ||
| Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)), | ||
| UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)), | ||
| UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)), | ||
| UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)), | ||
| UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)), | ||
| _ => { | ||
| return Err(ArrowError::CastError(format!( | ||
| "Unsupported type {to_index_type} for dictionary index" | ||
| ))); | ||
| } | ||
| Utf8View => { | ||
| // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer. | ||
| // we handle it here to avoid the copy. | ||
| let dict_array = array | ||
| .as_dictionary::<K>() | ||
| .downcast_dict::<StringArray>() | ||
| .ok_or_else(|| { | ||
| ArrowError::ComputeError( | ||
| "Internal Error: Cannot cast Utf8View to StringArray of expected type" | ||
| .to_string(), | ||
| ) | ||
| })?; | ||
| }; | ||
|
|
||
| let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>( | ||
| dict_array.values(), | ||
| dict_array.keys(), | ||
| )?; | ||
| Ok(Arc::new(string_view)) | ||
| } | ||
| BinaryView => { | ||
| // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer. | ||
| // we handle it here to avoid the copy. | ||
| let dict_array = array | ||
| .as_dictionary::<K>() | ||
| .downcast_dict::<BinaryArray>() | ||
| .ok_or_else(|| { | ||
| ArrowError::ComputeError( | ||
| "Internal Error: Cannot cast BinaryView to BinaryArray of expected type" | ||
| .to_string(), | ||
| ) | ||
| })?; | ||
|
|
||
| let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>( | ||
| dict_array.values(), | ||
| dict_array.keys(), | ||
| )?; | ||
| Ok(Arc::new(binary_view)) | ||
| } | ||
| _ => unpack_dictionary::<K>(array, to_type, cast_options), | ||
| } | ||
| Ok(new_array) | ||
| } | ||
|
|
||
| fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>( | ||
| array: &GenericByteArray<V>, | ||
| fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>( | ||
| keys: &PrimitiveArray<K>, | ||
| ) -> Result<GenericByteViewArray<T>, ArrowError> { | ||
| let value_buffer = array.values(); | ||
| let value_offsets = array.value_offsets(); | ||
| values: &GenericByteArray<V>, | ||
| ) -> Result<ArrayRef, ArrowError> { | ||
| let value_buffer = values.values(); | ||
| let value_offsets = values.value_offsets(); | ||
| let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len()); | ||
| builder.append_block(value_buffer.clone()); | ||
| for i in keys.iter() { | ||
|
|
@@ -157,21 +138,17 @@ fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArra | |
| } | ||
| } | ||
| } | ||
| Ok(builder.finish()) | ||
| Ok(Arc::new(builder.finish())) | ||
| } | ||
|
|
||
| // Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type | ||
| pub(crate) fn unpack_dictionary<K>( | ||
| array: &dyn Array, | ||
| // Unpack a dictionary into a flattened array of type to_type | ||
| pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>( | ||
| array: &DictionaryArray<K>, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved the cast to dictionary array to the parent function ( |
||
| to_type: &DataType, | ||
| cast_options: &CastOptions, | ||
| ) -> Result<ArrayRef, ArrowError> | ||
| where | ||
| K: ArrowDictionaryKeyType, | ||
| { | ||
| let dict_array = array.as_dictionary::<K>(); | ||
| let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?; | ||
| take(cast_dict_values.as_ref(), dict_array.keys(), None) | ||
| ) -> Result<ArrayRef, ArrowError> { | ||
| let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?; | ||
| take(cast_dict_values.as_ref(), array.keys(), None) | ||
| } | ||
|
|
||
| /// Pack a data type into a dictionary array passing the values through a primitive array | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I moved this inner logic into a separate function to be similar to how the other arms delegate to functions for the full logic.