diff --git a/src/iceberg/expression/predicate.cc b/src/iceberg/expression/predicate.cc index 959443c04..e1fcee650 100644 --- a/src/iceberg/expression/predicate.cc +++ b/src/iceberg/expression/predicate.cc @@ -25,7 +25,9 @@ #include "iceberg/expression/expressions.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/term.h" #include "iceberg/result.h" +#include "iceberg/transform.h" #include "iceberg/type.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter_internal.h" @@ -143,6 +145,27 @@ bool IsFloatingType(TypeId type) { return type == TypeId::kFloat || type == TypeId::kDouble; } +/// \brief Count the number of UTF-8 code points in a string. +/// This matches the behavior of TruncateUtils::TruncateUTF8. +/// +/// NOTE: This counts code points, not grapheme clusters (user-perceived characters). +/// Per the Iceberg spec, combining marks count as separate code points. +/// Example: "é" as e + combining-acute (U+0065 U+0301) = 2 code points, +/// but "é" as single precomposed character (U+00E9) = 1 code point. +/// +/// \param str The UTF-8 encoded string +/// \return The number of code points (not bytes, not graphemes) +inline int32_t CountUTF8CodePoints(std::string_view str) { + int32_t code_point_count = 0; + for (unsigned char c : str) { + // Start of a new UTF-8 code point (not a continuation byte 10xxxxxx) + if ((c & 0xC0) != 0x80) { + ++code_point_count; + } + } + return code_point_count; +} + bool IsNan(const Literal& literal) { const auto& value = literal.value(); if (std::holds_alternative(value)) { @@ -246,7 +269,69 @@ Result> UnboundPredicate::BindLiteralOperation( } } - // TODO(gangwu): translate truncate(col) == value to startsWith(value) + // Optimize: translate truncate(col, width) == value to col startsWith(value) + // This optimization allows better predicate pushdown and index usage + // IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points + // + // NOTE: This rewrite is safe because: + // - Iceberg string comparisons are binary (byte-for-byte), no collation + // - STARTS_WITH uses the same binary comparison semantics as equality + // - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w + // - When source has < w code points, truncate returns full string; equality + // implies exact match, so STARTS_WITH remains valid (short-string invariance) + if (BASE::op() == Expression::Operation::kEq && + bound_term->kind() == Term::Kind::kTransform) { + // Safe to cast after kind check confirms it's a transform + auto* transform_term = dynamic_cast(bound_term.get()); + if (!transform_term) { + // Should never happen after kind check, but be defensive + return std::make_shared(BASE::op(), std::move(bound_term), + std::move(literal)); + } + + if (transform_term->transform()->transform_type() == TransformType::kTruncate && + literal.type()->type_id() == TypeId::kString && + !literal.IsNull()) { // Null safety: skip null literals + + // Extract width parameter using type-safe API + auto width_opt = transform_term->transform()->param(); + if (!width_opt) { + // Should never happen for truncate, but be defensive + return std::make_shared(BASE::op(), std::move(bound_term), + std::move(literal)); + } + + int32_t truncate_width = *width_opt; + + // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("") + // which is tautologically true and could accidentally broaden filters + // (Note: Transform::Truncate already validates width > 0, but defensive check) + if (truncate_width == 0) { + return std::make_shared(BASE::op(), std::move(bound_term), + std::move(literal)); + } + + auto& string_value = std::get(literal.value()); + + // Count UTF-8 code points (not bytes!) + // Truncate uses code points: "José" has 4 code points but 5 bytes + int32_t code_point_count = CountUTF8CodePoints(string_value); + + // Only optimize if literal code point count equals truncate width + // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized + // truncate(col, 10) == "abc" (3 code points) CANNOT + // truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized + if (code_point_count == truncate_width) { + // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value" + // This benefits from strict metrics evaluation for startsWith in manifest + // filtering + return std::make_shared(Expression::Operation::kStartsWith, + transform_term->reference(), + std::move(literal)); + } + } + } + return std::make_shared(BASE::op(), std::move(bound_term), std::move(literal)); } diff --git a/src/iceberg/test/predicate_test.cc b/src/iceberg/test/predicate_test.cc index 532e908b4..6df50889d 100644 --- a/src/iceberg/test/predicate_test.cc +++ b/src/iceberg/test/predicate_test.cc @@ -439,6 +439,217 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) { EXPECT_EQ(nested->op(), Expression::Operation::kAnd); } +TEST_F(PredicateTest, TruncateOptimizationToStartsWith) { + // Test that truncate(col) == "value" is optimized to col STARTS_WITH "value" + + // Create a truncate transform expression: truncate(name, 5) + auto truncate_expr = Expressions::Truncate("name", 5); + + // Create predicate: truncate(name, 5) == "Alice" + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("Alice")); + + // Bind the predicate to the schema + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // After optimization, it should be a STARTS_WITH operation + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + // Verify it's a BoundLiteralPredicate + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + + // The term should now be a direct reference to "name", not a transform + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); + + // The literal should still be "Alice" + EXPECT_EQ(literal_pred->literal(), Literal::String("Alice")); +} + +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) { + // Test that optimization is NOT applied for non-equality operations + + auto truncate_expr = Expressions::Truncate("name", 5); + + // Test with less-than (should NOT be optimized) + auto truncate_lt_pred = std::make_shared>( + Expression::Operation::kLt, truncate_expr, Literal::String("Bob")); + auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_lt_result, IsOk()); + auto bound_lt = bound_lt_result.value(); + + // Should remain as kLt, not converted to STARTS_WITH + EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt); + + // The term should still be a transform + auto* literal_pred_lt = dynamic_cast(bound_lt.get()); + ASSERT_NE(literal_pred_lt, nullptr); + EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform); +} + +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) { + // Test that optimization is NOT applied for non-string types + // (truncate can also work on binary types, but optimization only applies to strings) + + // Create a schema with binary field + auto binary_schema = std::make_shared( + std::vector{SchemaField::MakeOptional(1, "data", binary())}, + /*schema_id=*/0); + + auto truncate_expr = Expressions::Truncate("data", 10); + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, + Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})); + + auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should remain as kEq, not converted to STARTS_WITH (binary doesn't support + // startsWith) + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); + + // The term should still be a transform + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); +} + +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) { + // CRITICAL TEST: Optimization must NOT apply when literal length != truncate width + // Example: truncate(col, 10) == "abc" should NOT become STARTS_WITH + // Because "abc1234567" would match STARTS_WITH but NOT truncate equality + + auto truncate_expr = Expressions::Truncate("name", 10); + + // Literal "abc" has length 3, but truncate width is 10 + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("abc")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should remain as kEq, NOT converted to STARTS_WITH + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); + + // The term should still be a transform (not optimized away) + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); +} + +TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) { + // Test that optimization IS applied when literal length == truncate width + + auto truncate_expr = Expressions::Truncate("name", 5); + + // Literal "Alice" has length 5, matching truncate width 5 + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("Alice")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should be optimized to STARTS_WITH + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + // The term should be a direct reference (optimization applied) + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); +} + +TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) { + // CRITICAL: Test UTF-8 code points vs bytes + // "José" = 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9) + + auto truncate_expr = Expressions::Truncate("name", 4); + + // "José" has 4 code points, matching truncate width 4 + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("José")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should be optimized to STARTS_WITH (code points match) + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); +} + +TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) { + // Test multi-byte UTF-8 characters + // "Hi👋" = 3 UTF-8 code points but 6 bytes (👋 = 4 bytes: 0xF0 0x9F 0x91 0x8B) + + auto truncate_expr = Expressions::Truncate("name", 3); + + // "Hi👋" has 3 code points + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("Hi👋")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should be optimized to STARTS_WITH + EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith); + + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference); +} + +TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) { + // "José" has 4 code points but we're comparing against width 5 + // Should NOT optimize + + auto truncate_expr = Expressions::Truncate("name", 5); + + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("José")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should NOT be optimized (code points 4 != width 5) + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); + + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); +} + +TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) { + // Empty literal with w > 0 should NOT optimize + // Empty string has 0 code points, which != width + // NOTE: width=0 is rejected by Transform::Truncate, so not tested here + + auto truncate_expr = Expressions::Truncate("name", 5); + + auto truncate_eq_pred = std::make_shared>( + Expression::Operation::kEq, truncate_expr, Literal::String("")); + + auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true); + ASSERT_THAT(bound_result, IsOk()); + auto bound_pred = bound_result.value(); + + // Should NOT be optimized (0 code points != width 5) + EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq); + + auto* literal_pred = dynamic_cast(bound_pred.get()); + ASSERT_NE(literal_pred, nullptr); + EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform); +} + TEST_F(PredicateTest, BoundUnaryPredicateNegate) { auto is_null_pred = Expressions::IsNull("name"); auto bound_null = is_null_pred->Bind(*schema_, /*case_sensitive=*/true).value(); diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index d06e31f22..79751541b 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable { /// \brief Returns the transform type. TransformType transform_type() const; + /// \brief Returns the optional parameter for parameterized transforms. + /// + /// For transforms like bucket(N) or truncate(W), returns the parameter value. + /// For non-parameterized transforms (identity, year, etc.), returns std::nullopt. + /// + /// \return The parameter if present, otherwise std::nullopt + std::optional param() const { + if (auto* p = std::get_if(¶m_)) { + return *p; + } + return std::nullopt; + } + /// \brief Binds this transform to a source type, returning a typed TransformFunction. /// /// This creates a concrete transform implementation based on the transform type and