Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 82 additions & 105 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,111 +28,92 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
) -> Result<ArrayRef, ArrowError> {
use DataType::*;

match to_type {
Dictionary(to_index_type, to_value_type) => {
let dict_array = array
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved this inner logic into a separate function to be similar to how the other arms delegate to functions for the full logic.

.as_any()
.downcast_ref::<DictionaryArray<K>>()
.ok_or_else(|| {
ArrowError::ComputeError(
"Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
)
})?;
let array = array.as_dictionary::<K>();
let from_child_type = array.values().data_type();
match (from_child_type, to_type) {
(_, Dictionary(to_index_type, to_value_type)) => {
dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
}
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
// copy of the value buffer. Fast path which avoids copying underlying values buffer.
// TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
// TODO: handle cross types (String -> BinaryView, Binary -> StringView)
// (need to validate utf8?)
(Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
array.keys(),
array.values().as_string::<i32>(),
),
(Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
array.keys(),
array.values().as_binary::<i32>(),
),
Comment on lines +39 to +49
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the main change; we check the values beforehand to see if it is valid for the fast path, which was missing before (and the assumption lead to an error). I've intentionally kept the behaviour as intended (limited the fast path only for Dictionary<Utf8> -> Utf8View, Dictionary<Binary> -> BinaryView) since was mainly interested in just fixing the cast bug. Left comments for potentially extending this fast path to be valid for more combinations (can raise issues for this).

_ => unpack_dictionary(array, to_type, cast_options),
}
}

let keys_array: ArrayRef =
Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
let values_array = dict_array.values();
let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
array: &DictionaryArray<K>,
to_index_type: &DataType,
to_value_type: &DataType,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
use DataType::*;

// Failure to cast keys (because they don't fit in the
// target type) results in NULL values;
if cast_keys.null_count() > keys_array.null_count() {
return Err(ArrowError::ComputeError(format!(
"Could not convert {} dictionary indexes from {:?} to {:?}",
cast_keys.null_count() - keys_array.null_count(),
keys_array.data_type(),
to_index_type
)));
}
let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
let values_array = array.values();
let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;

let data = cast_keys.into_data();
let builder = data
.into_builder()
.data_type(to_type.clone())
.child_data(vec![cast_values.into_data()]);
// Failure to cast keys (because they don't fit in the
// target type) results in NULL values;
if cast_keys.null_count() > keys_array.null_count() {
return Err(ArrowError::ComputeError(format!(
"Could not convert {} dictionary indexes from {:?} to {:?}",
cast_keys.null_count() - keys_array.null_count(),
keys_array.data_type(),
to_index_type
)));
}

// Safety
// Cast keys are still valid
let data = unsafe { builder.build_unchecked() };
let data = cast_keys.into_data();
let builder = data
.into_builder()
.data_type(Dictionary(
Box::new(to_index_type.clone()),
Box::new(to_value_type.clone()),
))
.child_data(vec![cast_values.into_data()]);

// create the appropriate array type
let new_array: ArrayRef = match **to_index_type {
Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
_ => {
return Err(ArrowError::CastError(format!(
"Unsupported type {to_index_type} for dictionary index"
)));
}
};
// Safety
// Cast keys are still valid
let data = unsafe { builder.build_unchecked() };

Ok(new_array)
// create the appropriate array type
let new_array: ArrayRef = match to_index_type {
Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
_ => {
return Err(ArrowError::CastError(format!(
"Unsupported type {to_index_type} for dictionary index"
)));
}
Utf8View => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<StringArray>()
.ok_or_else(|| {
ArrowError::ComputeError(
"Internal Error: Cannot cast Utf8View to StringArray of expected type"
.to_string(),
)
})?;
};

let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
dict_array.values(),
dict_array.keys(),
)?;
Ok(Arc::new(string_view))
}
BinaryView => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<BinaryArray>()
.ok_or_else(|| {
ArrowError::ComputeError(
"Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
.to_string(),
)
})?;

let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
dict_array.values(),
dict_array.keys(),
)?;
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
Ok(new_array)
}

fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
array: &GenericByteArray<V>,
fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
keys: &PrimitiveArray<K>,
) -> Result<GenericByteViewArray<T>, ArrowError> {
let value_buffer = array.values();
let value_offsets = array.value_offsets();
values: &GenericByteArray<V>,
) -> Result<ArrayRef, ArrowError> {
let value_buffer = values.values();
let value_offsets = values.value_offsets();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
builder.append_block(value_buffer.clone());
for i in keys.iter() {
Expand All @@ -157,21 +138,17 @@ fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArra
}
}
}
Ok(builder.finish())
Ok(Arc::new(builder.finish()))
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
pub(crate) fn unpack_dictionary<K>(
array: &dyn Array,
// Unpack a dictionary into a flattened array of type to_type
pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
array: &DictionaryArray<K>,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the cast to dictionary array to the parent function (dictionary_cast()) so we don't need to always cast in each of the specialized functions.

to_type: &DataType,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType,
{
let dict_array = array.as_dictionary::<K>();
let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
take(cast_dict_values.as_ref(), dict_array.keys(), None)
) -> Result<ArrayRef, ArrowError> {
let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
take(cast_dict_values.as_ref(), array.keys(), None)
}

/// Pack a data type into a dictionary array passing the values through a primitive array
Expand Down
56 changes: 56 additions & 0 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11866,4 +11866,60 @@ mod tests {
// Verify the run-ends were cast correctly (run ends at 3, 6, 9)
assert_eq!(run_array.run_ends().values(), &[3i64, 6i64, 9i64]);
}

#[test]
fn test_string_dicts_to_binary_view() {
let expected = BinaryViewArray::from_iter(vec![
VIEW_TEST_DATA[1],
VIEW_TEST_DATA[0],
None,
VIEW_TEST_DATA[3],
None,
VIEW_TEST_DATA[1],
VIEW_TEST_DATA[4],
]);

let values_arrays: [ArrayRef; _] = [
Arc::new(StringArray::from_iter(VIEW_TEST_DATA)),
Arc::new(StringViewArray::from_iter(VIEW_TEST_DATA)),
Arc::new(LargeStringArray::from_iter(VIEW_TEST_DATA)),
];
for values in values_arrays {
let keys =
Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
let string_dict_array =
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();

let casted = cast(&string_dict_array, &DataType::BinaryView).unwrap();
assert_eq!(casted.as_ref(), &expected);
}
}

#[test]
fn test_binary_dicts_to_string_view() {
let expected = StringViewArray::from_iter(vec![
VIEW_TEST_DATA[1],
VIEW_TEST_DATA[0],
None,
VIEW_TEST_DATA[3],
None,
VIEW_TEST_DATA[1],
VIEW_TEST_DATA[4],
]);

let values_arrays: [ArrayRef; _] = [
Arc::new(BinaryArray::from_iter(VIEW_TEST_DATA)),
Arc::new(BinaryViewArray::from_iter(VIEW_TEST_DATA)),
Arc::new(LargeBinaryArray::from_iter(VIEW_TEST_DATA)),
];
for values in values_arrays {
let keys =
Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
let string_dict_array =
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();

let casted = cast(&string_dict_array, &DataType::Utf8View).unwrap();
assert_eq!(casted.as_ref(), &expected);
}
}
}
Loading