Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion src/iceberg/expression/predicate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@

#include "iceberg/expression/expressions.h"
#include "iceberg/expression/literal.h"
#include "iceberg/expression/term.h"
#include "iceberg/result.h"
#include "iceberg/transform.h"
#include "iceberg/type.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/formatter_internal.h"
Expand Down Expand Up @@ -143,6 +145,27 @@ bool IsFloatingType(TypeId type) {
return type == TypeId::kFloat || type == TypeId::kDouble;
}

/// \brief Count the number of UTF-8 code points in a string.
/// This matches the behavior of TruncateUtils::TruncateUTF8.
///
/// NOTE: This counts code points, not grapheme clusters (user-perceived characters).
/// Per the Iceberg spec, combining marks count as separate code points.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Iceberg spec does not have this. We should not mislead readers.

/// Example: "é" as e + combining-acute (U+0065 U+0301) = 2 code points,
/// but "é" as single precomposed character (U+00E9) = 1 code point.
///
/// \param str The UTF-8 encoded string
/// \return The number of code points (not bytes, not graphemes)
inline int32_t CountUTF8CodePoints(std::string_view str) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this should be moved to string_util.h or truncate_util.h

int32_t code_point_count = 0;
for (unsigned char c : str) {
// Start of a new UTF-8 code point (not a continuation byte 10xxxxxx)
if ((c & 0xC0) != 0x80) {
++code_point_count;
}
}
return code_point_count;
}

bool IsNan(const Literal& literal) {
const auto& value = literal.value();
if (std::holds_alternative<float>(value)) {
Expand Down Expand Up @@ -246,7 +269,69 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
}
}

// TODO(gangwu): translate truncate(col) == value to startsWith(value)
// Optimize: translate truncate(col, width) == value to col startsWith(value)
// This optimization allows better predicate pushdown and index usage
// IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points
//
// NOTE: This rewrite is safe because:
// - Iceberg string comparisons are binary (byte-for-byte), no collation
// - STARTS_WITH uses the same binary comparison semantics as equality
// - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w
// - When source has < w code points, truncate returns full string; equality
// implies exact match, so STARTS_WITH remains valid (short-string invariance)
if (BASE::op() == Expression::Operation::kEq &&
bound_term->kind() == Term::Kind::kTransform) {
// Safe to cast after kind check confirms it's a transform
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
if (!transform_term) {
// Should never happen after kind check, but be defensive
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
std::move(literal));
}
Comment on lines +285 to +290
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
if (!transform_term) {
// Should never happen after kind check, but be defensive
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
std::move(literal));
}
auto* transform_term = internal::checked_cast<BoundTransform*>(bound_term.get());

Let's reuse the existing pattern for cast.


if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
literal.type()->type_id() == TypeId::kString &&
!literal.IsNull()) { // Null safety: skip null literals

// Extract width parameter using type-safe API
auto width_opt = transform_term->transform()->param();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can greatly simplify this line and below.

We can add transform_func like below.

class ICEBERG_EXPORT BoundTransform : public BoundTerm {
 public:
  const std::shared_ptr<TransformFunction>& transform_func() const { return transform_func_; }
};

Then we just need to call transform_term->transform_func()->Transform(literal) and check if its return value is same as literal. We don't even need CountUTF8CodePoints above.

if (!width_opt) {
// Should never happen for truncate, but be defensive
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
std::move(literal));
}

int32_t truncate_width = *width_opt;

// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
// which is tautologically true and could accidentally broaden filters
// (Note: Transform::Truncate already validates width > 0, but defensive check)
if (truncate_width == 0) {
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
std::move(literal));
}

auto& string_value = std::get<std::string>(literal.value());

// Count UTF-8 code points (not bytes!)
// Truncate uses code points: "José" has 4 code points but 5 bytes
int32_t code_point_count = CountUTF8CodePoints(string_value);

// Only optimize if literal code point count equals truncate width
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
// truncate(col, 10) == "abc" (3 code points) CANNOT
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
if (code_point_count == truncate_width) {
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
// This benefits from strict metrics evaluation for startsWith in manifest
// filtering
return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
transform_term->reference(),
std::move(literal));
}
}
}

return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
std::move(literal));
}
Expand Down
211 changes: 211 additions & 0 deletions src/iceberg/test/predicate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,217 @@ TEST_F(PredicateTest, ComplexExpressionCombinations) {
EXPECT_EQ(nested->op(), Expression::Operation::kAnd);
}

TEST_F(PredicateTest, TruncateOptimizationToStartsWith) {
// Test that truncate(col) == "value" is optimized to col STARTS_WITH "value"

// Create a truncate transform expression: truncate(name, 5)
auto truncate_expr = Expressions::Truncate("name", 5);

// Create predicate: truncate(name, 5) == "Alice"
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));

// Bind the predicate to the schema
auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// After optimization, it should be a STARTS_WITH operation
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);

// Verify it's a BoundLiteralPredicate
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);

// The term should now be a direct reference to "name", not a transform
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);

// The literal should still be "Alice"
EXPECT_EQ(literal_pred->literal(), Literal::String("Alice"));
}

TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonEquality) {
// Test that optimization is NOT applied for non-equality operations

auto truncate_expr = Expressions::Truncate("name", 5);

// Test with less-than (should NOT be optimized)
auto truncate_lt_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kLt, truncate_expr, Literal::String("Bob"));
auto bound_lt_result = truncate_lt_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_lt_result, IsOk());
auto bound_lt = bound_lt_result.value();

// Should remain as kLt, not converted to STARTS_WITH
EXPECT_EQ(bound_lt->op(), Expression::Operation::kLt);

// The term should still be a transform
auto* literal_pred_lt = dynamic_cast<BoundLiteralPredicate*>(bound_lt.get());
ASSERT_NE(literal_pred_lt, nullptr);
EXPECT_EQ(literal_pred_lt->term()->kind(), Term::Kind::kTransform);
}

TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
// Test that optimization is NOT applied for non-string types
// (truncate can also work on binary types, but optimization only applies to strings)

// Create a schema with binary field
auto binary_schema = std::make_shared<Schema>(
std::vector<SchemaField>{SchemaField::MakeOptional(1, "data", binary())},
/*schema_id=*/0);

auto truncate_expr = Expressions::Truncate("data", 10);
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr,
Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05}));

auto bound_result = truncate_eq_pred->Bind(*binary_schema, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should remain as kEq, not converted to STARTS_WITH (binary doesn't support
// startsWith)
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);

// The term should still be a transform
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
}

TEST_F(PredicateTest, TruncateOptimizationNotAppliedForWidthMismatch) {
// CRITICAL TEST: Optimization must NOT apply when literal length != truncate width
// Example: truncate(col, 10) == "abc" should NOT become STARTS_WITH
// Because "abc1234567" would match STARTS_WITH but NOT truncate equality

auto truncate_expr = Expressions::Truncate("name", 10);

// Literal "abc" has length 3, but truncate width is 10
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("abc"));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should remain as kEq, NOT converted to STARTS_WITH
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);

// The term should still be a transform (not optimized away)
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
}

TEST_F(PredicateTest, TruncateOptimizationAppliedWhenLengthMatches) {
// Test that optimization IS applied when literal length == truncate width

auto truncate_expr = Expressions::Truncate("name", 5);

// Literal "Alice" has length 5, matching truncate width 5
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("Alice"));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should be optimized to STARTS_WITH
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);

// The term should be a direct reference (optimization applied)
auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
}

TEST_F(PredicateTest, TruncateOptimizationWithUTF8Accents) {
// CRITICAL: Test UTF-8 code points vs bytes
// "José" = 4 UTF-8 code points but 5 bytes (é = 0xC3 0xA9)

auto truncate_expr = Expressions::Truncate("name", 4);

// "José" has 4 code points, matching truncate width 4
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("José"));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should be optimized to STARTS_WITH (code points match)
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);

auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
}

TEST_F(PredicateTest, TruncateOptimizationWithUTF8Emoji) {
// Test multi-byte UTF-8 characters
// "Hi👋" = 3 UTF-8 code points but 6 bytes (👋 = 4 bytes: 0xF0 0x9F 0x91 0x8B)

auto truncate_expr = Expressions::Truncate("name", 3);

// "Hi👋" has 3 code points
auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("Hi👋"));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should be optimized to STARTS_WITH
EXPECT_EQ(bound_pred->op(), Expression::Operation::kStartsWith);

auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kReference);
}

TEST_F(PredicateTest, TruncateOptimizationNotAppliedWhenUTF8LengthMismatch) {
// "José" has 4 code points but we're comparing against width 5
// Should NOT optimize

auto truncate_expr = Expressions::Truncate("name", 5);

auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String("José"));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should NOT be optimized (code points 4 != width 5)
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);

auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
}

TEST_F(PredicateTest, TruncateOptimizationNotAppliedForEmptyLiteralWithNonZeroWidth) {
// Empty literal with w > 0 should NOT optimize
// Empty string has 0 code points, which != width
// NOTE: width=0 is rejected by Transform::Truncate, so not tested here

auto truncate_expr = Expressions::Truncate("name", 5);

auto truncate_eq_pred = std::make_shared<UnboundPredicate<BoundTransform>>(
Expression::Operation::kEq, truncate_expr, Literal::String(""));

auto bound_result = truncate_eq_pred->Bind(*schema_, /*case_sensitive=*/true);
ASSERT_THAT(bound_result, IsOk());
auto bound_pred = bound_result.value();

// Should NOT be optimized (0 code points != width 5)
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);

auto* literal_pred = dynamic_cast<BoundLiteralPredicate*>(bound_pred.get());
ASSERT_NE(literal_pred, nullptr);
EXPECT_EQ(literal_pred->term()->kind(), Term::Kind::kTransform);
}

TEST_F(PredicateTest, BoundUnaryPredicateNegate) {
auto is_null_pred = Expressions::IsNull("name");
auto bound_null = is_null_pred->Bind(*schema_, /*case_sensitive=*/true).value();
Expand Down
13 changes: 13 additions & 0 deletions src/iceberg/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
/// \brief Returns the transform type.
TransformType transform_type() const;

/// \brief Returns the optional parameter for parameterized transforms.
///
/// For transforms like bucket(N) or truncate(W), returns the parameter value.
/// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
///
/// \return The parameter if present, otherwise std::nullopt
std::optional<int32_t> param() const {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove this function? This makes it harder to evolve the variant in the future.

if (auto* p = std::get_if<int32_t>(&param_)) {
return *p;
}
return std::nullopt;
}

/// \brief Binds this transform to a source type, returning a typed TransformFunction.
///
/// This creates a concrete transform implementation based on the transform type and
Expand Down
Loading