From 1fbe5a5449854d7c3f19a5ee2c5e3df8a928e605 Mon Sep 17 00:00:00 2001 From: jhrotko Date: Thu, 13 Nov 2025 17:25:58 +0000 Subject: [PATCH] [wip] integrate uuid type in gandiva --- c/pom.xml | 2 +- ci/scripts/jni_build.sh | 1 - ci/scripts/jni_macos_build.sh | 11 +++- dataset/pom.xml | 2 +- dataset/src/main/cpp/jni_wrapper.cc | 5 ++ gandiva/pom.xml | 2 +- gandiva/proto/gandiva/types.proto | 1 + .../main/cpp/expression_registry_helper.cc | 15 +++++ .../gandiva/evaluator/ExpressionRegistry.java | 15 ++++- .../arrow/gandiva/evaluator/UuidType.java | 62 +++++++++++++++++++ pom.xml | 2 +- 11 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/UuidType.java diff --git a/c/pom.xml b/c/pom.xml index 290cb561c1..830b519752 100644 --- a/c/pom.xml +++ b/c/pom.xml @@ -22,7 +22,7 @@ under the License. org.apache.arrow arrow-java-root - 18.3.0 + 18.3.0-SNAPSHOT arrow-c-data diff --git a/ci/scripts/jni_build.sh b/ci/scripts/jni_build.sh index aec6fc325c..7a2b95f823 100755 --- a/ci/scripts/jni_build.sh +++ b/ci/scripts/jni_build.sh @@ -63,7 +63,6 @@ cmake \ -DCMAKE_PREFIX_PATH="${arrow_install_dir}" \ -DCMAKE_INSTALL_PREFIX="${prefix_dir}" \ -DCMAKE_UNITY_BUILD="${CMAKE_UNITY_BUILD:-OFF}" \ - -DProtobuf_USE_STATIC_LIBS=ON \ -GNinja \ "${EXTRA_CMAKE_OPTIONS[@]}" cmake --build "${build_dir}" diff --git a/ci/scripts/jni_macos_build.sh b/ci/scripts/jni_macos_build.sh index f7543b6f7a..5c90725984 100755 --- a/ci/scripts/jni_macos_build.sh +++ b/ci/scripts/jni_macos_build.sh @@ -67,7 +67,7 @@ export ARROW_BUILD_TESTS export ARROW_DATASET : "${ARROW_GANDIVA:=ON}" export ARROW_GANDIVA -: "${ARROW_ORC:=ON}" +: "${ARROW_ORC:=OFF}" export ARROW_ORC : "${ARROW_PARQUET:=ON}" : "${ARROW_S3:=ON}" @@ -125,7 +125,14 @@ if [ "${ARROW_RUN_TESTS:-}" == "ON" ]; then github_actions_group_end fi -export JAVA_JNI_CMAKE_ARGS="-DProtobuf_ROOT=${build_dir}/cpp/protobuf_ep-install" +# Don't set Protobuf_ROOT if it doesn't exist (when using bundled dependencies) +# Instead, let CMake find the system protobuf +if [ -d "${build_dir}/cpp/protobuf_ep-install" ]; then + export JAVA_JNI_CMAKE_ARGS="-DProtobuf_ROOT=${build_dir}/cpp/protobuf_ep-install" +else + # Use system protobuf - set library path explicitly + export JAVA_JNI_CMAKE_ARGS="-DProtobuf_LIBRARY=/usr/local/lib/libprotobuf.dylib -DProtobuf_PROTOC_EXECUTABLE=/usr/local/bin/protoc" +fi "${source_dir}/ci/scripts/jni_build.sh" \ "${source_dir}" \ "${install_dir}" \ diff --git a/dataset/pom.xml b/dataset/pom.xml index efbe310ea2..fba0e0ff7f 100644 --- a/dataset/pom.xml +++ b/dataset/pom.xml @@ -22,7 +22,7 @@ under the License. org.apache.arrow arrow-java-root - 18.3.0 + 18.3.0-SNAPSHOT arrow-dataset diff --git a/dataset/src/main/cpp/jni_wrapper.cc b/dataset/src/main/cpp/jni_wrapper.cc index 49cc85251c..2380ca9dcb 100644 --- a/dataset/src/main/cpp/jni_wrapper.cc +++ b/dataset/src/main/cpp/jni_wrapper.cc @@ -25,9 +25,14 @@ #include "arrow/c/helpers.h" #include "arrow/dataset/api.h" #include "arrow/dataset/file_base.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/dataset/file_ipc.h" #ifdef ARROW_CSV #include "arrow/dataset/file_csv.h" #endif +#ifdef ARROW_JSON +#include "arrow/dataset/file_json.h" +#endif #include "arrow/filesystem/api.h" #include "arrow/filesystem/path_util.h" #include "arrow/engine/substrait/util.h" diff --git a/gandiva/pom.xml b/gandiva/pom.xml index 95c62b58bc..98a4622628 100644 --- a/gandiva/pom.xml +++ b/gandiva/pom.xml @@ -22,7 +22,7 @@ under the License. org.apache.arrow arrow-java-root - 18.3.0 + 18.3.0-SNAPSHOT org.apache.arrow.gandiva diff --git a/gandiva/proto/gandiva/types.proto b/gandiva/proto/gandiva/types.proto index 4ce342681d..27299f273e 100644 --- a/gandiva/proto/gandiva/types.proto +++ b/gandiva/proto/gandiva/types.proto @@ -85,6 +85,7 @@ message ExtGandivaType { optional TimeUnit timeUnit = 6; // used by TIME32/TIME64 optional string timeZone = 7; // used by TIMESTAMP optional IntervalType intervalType = 8; // used by INTERVAL + optional string extensionName = 9; // used by extension types (e.g., "uuid") } message Field { diff --git a/gandiva/src/main/cpp/expression_registry_helper.cc b/gandiva/src/main/cpp/expression_registry_helper.cc index 21077ff1db..647dfef3ce 100644 --- a/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/gandiva/src/main/cpp/expression_registry_helper.cc @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -42,6 +43,13 @@ gandiva::types::TimeUnit MapTimeUnit(arrow::TimeUnit::type& unit) { } void ArrowToProtobuf(DataTypePtr type, gandiva::types::ExtGandivaType* gandiva_data_type) { + // Handle extension types by preserving extension name and using storage type + if (type->id() == arrow::Type::EXTENSION) { + auto ext_type = std::dynamic_pointer_cast(type); + gandiva_data_type->set_extensionname(ext_type->extension_name()); + type = ext_type->storage_type(); + } + switch (type->id()) { case arrow::Type::BOOL: gandiva_data_type->set_type(gandiva::types::GandivaType::BOOL); @@ -85,6 +93,13 @@ void ArrowToProtobuf(DataTypePtr type, gandiva::types::ExtGandivaType* gandiva_d case arrow::Type::BINARY: gandiva_data_type->set_type(gandiva::types::GandivaType::BINARY); break; + case arrow::Type::FIXED_SIZE_BINARY: { + gandiva_data_type->set_type(gandiva::types::GandivaType::FIXED_SIZE_BINARY); + std::shared_ptr fixed_size_binary_type = + std::dynamic_pointer_cast(type); + gandiva_data_type->set_width(fixed_size_binary_type->byte_width()); + break; + } case arrow::Type::DATE32: gandiva_data_type->set_type(gandiva::types::GandivaType::DATE32); break; diff --git a/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 49625edf27..b1b7bff9f0 100644 --- a/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -126,6 +126,18 @@ private static Set getSupportedFunctionsFromGandiva() throws } private static ArrowType getArrowType(ExtGandivaType type) { + // Check if this is an extension type + if (type.hasExtensionName() && !type.getExtensionName().isEmpty()) { + String extensionName = type.getExtensionName(); + + // Handle known extension types + if ("arrow.uuid".equals(extensionName)) { + // this should be the new Arrow UUID type from: https://github.com/apache/arrow-java/pull/903 + return new UuidType(); + } + throw new UnsupportedOperationException("Cannot get ArrowType for unknown extension type: " + extensionName); + } + switch (type.getType().getNumber()) { case GandivaType.BOOL_VALUE: return ArrowType.Bool.INSTANCE; @@ -155,6 +167,8 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Utf8(); case GandivaType.BINARY_VALUE: return new ArrowType.Binary(); + case GandivaType.FIXED_SIZE_BINARY_VALUE: + return new ArrowType.FixedSizeBinary(type.getWidth()); case GandivaType.DATE32_VALUE: return new ArrowType.Date(DateUnit.DAY); case GandivaType.DATE64_VALUE: @@ -171,7 +185,6 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Decimal(0, 0, 128); case GandivaType.INTERVAL_VALUE: return new ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); - case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.DICTIONARY_VALUE: case GandivaType.LIST_VALUE: diff --git a/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/UuidType.java b/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/UuidType.java new file mode 100644 index 0000000000..914bb57703 --- /dev/null +++ b/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/UuidType.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.apache.arrow.vector.types.pojo.FieldType; + +/** UUID extension type for Gandiva. THIS SHOULD NOT BE INCLUDED!!!*/ +public class UuidType extends ExtensionType { + + @Override + public ArrowType storageType() { + return new ArrowType.FixedSizeBinary(16); + } + + @Override + public String extensionName() { + return "arrow.uuid"; + } + + @Override + public boolean extensionEquals(ExtensionType other) { + return other instanceof UuidType; + } + + @Override + public ArrowType deserialize(ArrowType storageType, String serializedData) { + if (!storageType.equals(storageType())) { + throw new UnsupportedOperationException( + "Cannot construct UuidType from underlying type " + storageType); + } + return new UuidType(); + } + + @Override + public String serialize() { + return ""; + } + + @Override + public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator) { + return new FixedSizeBinaryVector(name, allocator, 16); + } +} diff --git a/pom.xml b/pom.xml index 1dcda5c40a..a16134e899 100644 --- a/pom.xml +++ b/pom.xml @@ -28,7 +28,7 @@ under the License. org.apache.arrow arrow-java-root - 18.3.0 + 18.3.0-SNAPSHOT pom Apache Arrow Java Root POM