-
Notifications
You must be signed in to change notification settings - Fork 63
feat: optimize truncate(col) == value to startsWith predicate #289
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -25,7 +25,9 @@ | |||||||||||||||
|
|
||||||||||||||||
| #include "iceberg/expression/expressions.h" | ||||||||||||||||
| #include "iceberg/expression/literal.h" | ||||||||||||||||
| #include "iceberg/expression/term.h" | ||||||||||||||||
| #include "iceberg/result.h" | ||||||||||||||||
| #include "iceberg/transform.h" | ||||||||||||||||
| #include "iceberg/type.h" | ||||||||||||||||
| #include "iceberg/util/checked_cast.h" | ||||||||||||||||
| #include "iceberg/util/formatter_internal.h" | ||||||||||||||||
|
|
@@ -143,6 +145,27 @@ bool IsFloatingType(TypeId type) { | |||||||||||||||
| return type == TypeId::kFloat || type == TypeId::kDouble; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| /// \brief Count the number of UTF-8 code points in a string. | ||||||||||||||||
| /// This matches the behavior of TruncateUtils::TruncateUTF8. | ||||||||||||||||
| /// | ||||||||||||||||
| /// NOTE: This counts code points, not grapheme clusters (user-perceived characters). | ||||||||||||||||
| /// Per the Iceberg spec, combining marks count as separate code points. | ||||||||||||||||
| /// Example: "é" as e + combining-acute (U+0065 U+0301) = 2 code points, | ||||||||||||||||
| /// but "é" as single precomposed character (U+00E9) = 1 code point. | ||||||||||||||||
| /// | ||||||||||||||||
| /// \param str The UTF-8 encoded string | ||||||||||||||||
| /// \return The number of code points (not bytes, not graphemes) | ||||||||||||||||
| inline int32_t CountUTF8CodePoints(std::string_view str) { | ||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps this should be moved to |
||||||||||||||||
| int32_t code_point_count = 0; | ||||||||||||||||
| for (unsigned char c : str) { | ||||||||||||||||
| // Start of a new UTF-8 code point (not a continuation byte 10xxxxxx) | ||||||||||||||||
| if ((c & 0xC0) != 0x80) { | ||||||||||||||||
| ++code_point_count; | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| return code_point_count; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| bool IsNan(const Literal& literal) { | ||||||||||||||||
| const auto& value = literal.value(); | ||||||||||||||||
| if (std::holds_alternative<float>(value)) { | ||||||||||||||||
|
|
@@ -246,7 +269,69 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation( | |||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| // TODO(gangwu): translate truncate(col) == value to startsWith(value) | ||||||||||||||||
| // Optimize: translate truncate(col, width) == value to col startsWith(value) | ||||||||||||||||
| // This optimization allows better predicate pushdown and index usage | ||||||||||||||||
| // IMPORTANT: Only valid when literal has exactly `width` UTF-8 code points | ||||||||||||||||
| // | ||||||||||||||||
| // NOTE: This rewrite is safe because: | ||||||||||||||||
| // - Iceberg string comparisons are binary (byte-for-byte), no collation | ||||||||||||||||
| // - STARTS_WITH uses the same binary comparison semantics as equality | ||||||||||||||||
| // - truncate(col, w) == "value" ⟺ col STARTS_WITH "value" when len(value) == w | ||||||||||||||||
| // - When source has < w code points, truncate returns full string; equality | ||||||||||||||||
| // implies exact match, so STARTS_WITH remains valid (short-string invariance) | ||||||||||||||||
| if (BASE::op() == Expression::Operation::kEq && | ||||||||||||||||
| bound_term->kind() == Term::Kind::kTransform) { | ||||||||||||||||
| // Safe to cast after kind check confirms it's a transform | ||||||||||||||||
| auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get()); | ||||||||||||||||
| if (!transform_term) { | ||||||||||||||||
| // Should never happen after kind check, but be defensive | ||||||||||||||||
| return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), | ||||||||||||||||
| std::move(literal)); | ||||||||||||||||
| } | ||||||||||||||||
|
Comment on lines
+285
to
+290
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Let's reuse the existing pattern for cast. |
||||||||||||||||
|
|
||||||||||||||||
| if (transform_term->transform()->transform_type() == TransformType::kTruncate && | ||||||||||||||||
| literal.type()->type_id() == TypeId::kString && | ||||||||||||||||
| !literal.IsNull()) { // Null safety: skip null literals | ||||||||||||||||
|
|
||||||||||||||||
| // Extract width parameter using type-safe API | ||||||||||||||||
| auto width_opt = transform_term->transform()->param(); | ||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can greatly simplify this line and below. We can add class ICEBERG_EXPORT BoundTransform : public BoundTerm {
public:
const std::shared_ptr<TransformFunction>& transform_func() const { return transform_func_; }
};Then we just need to call |
||||||||||||||||
| if (!width_opt) { | ||||||||||||||||
| // Should never happen for truncate, but be defensive | ||||||||||||||||
| return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), | ||||||||||||||||
| std::move(literal)); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| int32_t truncate_width = *width_opt; | ||||||||||||||||
|
|
||||||||||||||||
| // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("") | ||||||||||||||||
| // which is tautologically true and could accidentally broaden filters | ||||||||||||||||
| // (Note: Transform::Truncate already validates width > 0, but defensive check) | ||||||||||||||||
| if (truncate_width == 0) { | ||||||||||||||||
| return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), | ||||||||||||||||
| std::move(literal)); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| auto& string_value = std::get<std::string>(literal.value()); | ||||||||||||||||
|
|
||||||||||||||||
| // Count UTF-8 code points (not bytes!) | ||||||||||||||||
| // Truncate uses code points: "José" has 4 code points but 5 bytes | ||||||||||||||||
| int32_t code_point_count = CountUTF8CodePoints(string_value); | ||||||||||||||||
|
|
||||||||||||||||
| // Only optimize if literal code point count equals truncate width | ||||||||||||||||
| // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized | ||||||||||||||||
| // truncate(col, 10) == "abc" (3 code points) CANNOT | ||||||||||||||||
| // truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized | ||||||||||||||||
| if (code_point_count == truncate_width) { | ||||||||||||||||
| // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value" | ||||||||||||||||
| // This benefits from strict metrics evaluation for startsWith in manifest | ||||||||||||||||
| // filtering | ||||||||||||||||
| return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith, | ||||||||||||||||
| transform_term->reference(), | ||||||||||||||||
| std::move(literal)); | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), | ||||||||||||||||
| std::move(literal)); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable { | |
| /// \brief Returns the transform type. | ||
| TransformType transform_type() const; | ||
|
|
||
| /// \brief Returns the optional parameter for parameterized transforms. | ||
| /// | ||
| /// For transforms like bucket(N) or truncate(W), returns the parameter value. | ||
| /// For non-parameterized transforms (identity, year, etc.), returns std::nullopt. | ||
| /// | ||
| /// \return The parameter if present, otherwise std::nullopt | ||
| std::optional<int32_t> param() const { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we remove this function? This makes it harder to evolve the variant in the future. |
||
| if (auto* p = std::get_if<int32_t>(¶m_)) { | ||
| return *p; | ||
| } | ||
| return std::nullopt; | ||
| } | ||
|
|
||
| /// \brief Binds this transform to a source type, returning a typed TransformFunction. | ||
| /// | ||
| /// This creates a concrete transform implementation based on the transform type and | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Iceberg spec does not have this. We should not mislead readers.