Skip to content

Commit a831452

Browse files
committed
fix: replace brittle regex parsing with type-safe Transform::param() API
Address critical code review feedback by removing regex-based width extraction. Changes: - Added Transform::param() public getter returning std::optional<int32_t> - Updated truncate optimization to use param() instead of ToString() + regex - Removed <regex> include (no longer needed) - More robust: immune to ToString() format changes - Better performance: no regex compilation or string parsing - Type-safe: compile-time checked, no runtime parsing errors This fixes the most critical issue from code review - the brittle dependency on Transform::ToString() string format. The new API is production-grade and follows C++ best practices for accessing configuration parameters.
1 parent a777ab7 commit a831452

File tree

3 files changed

+59
-43
lines changed

3 files changed

+59
-43
lines changed

src/iceberg/expression/predicate.cc

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include <algorithm>
2323
#include <format>
24-
#include <regex>
2524

2625
#include "iceberg/exception.h"
2726
#include "iceberg/expression/expressions.h"
@@ -262,51 +261,53 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
262261
// implies exact match, so STARTS_WITH remains valid (short-string invariance)
263262
if (BASE::op() == Expression::Operation::kEq &&
264263
bound_term->kind() == Term::Kind::kTransform) {
265-
// Use checked_cast for fail-fast debug behavior
266-
auto* transform_term =
267-
internal::checked_cast<BoundTransform*>(bound_term.get());
264+
// Safe to cast after kind check confirms it's a transform
265+
auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
266+
if (!transform_term) {
267+
// Should never happen after kind check, but be defensive
268+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
269+
std::move(literal));
270+
}
268271

269272
if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
270273
literal.type()->type_id() == TypeId::kString &&
271274
!literal.IsNull()) { // Null safety: skip null literals
272275

273-
// TODO: Avoid ToString/regex parsing once Transform API exposes width directly
274-
// (e.g., TruncateTransform::width() getter would be cleaner and faster)
275-
// Extract width from transform string (format: "truncate[width]")
276-
std::string transform_str = transform_term->transform()->ToString();
277-
278-
// Static regex to avoid recompilation on each bind (micro-optimization)
279-
static const std::regex width_regex(R"(truncate\[(\d+)\])");
280-
std::smatch match;
281-
282-
if (std::regex_match(transform_str, match, width_regex)) {
283-
int32_t truncate_width = std::stoi(match[1].str());
284-
285-
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
286-
// which is tautologically true and could accidentally broaden filters
287-
if (truncate_width == 0) {
288-
// Don't optimize; let the normal predicate handle this edge case
289-
return std::make_shared<BoundLiteralPredicate>(
290-
BASE::op(), std::move(bound_term), std::move(literal));
291-
}
292-
293-
auto& string_value = std::get<std::string>(literal.value());
294-
295-
// Count UTF-8 code points (not bytes!)
296-
// Truncate uses code points: "José" has 4 code points but 5 bytes
297-
int32_t code_point_count = CountUTF8CodePoints(string_value);
298-
299-
// Only optimize if literal code point count equals truncate width
300-
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
301-
// truncate(col, 10) == "abc" (3 code points) CANNOT
302-
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
303-
if (code_point_count == truncate_width) {
304-
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
305-
// This benefits from strict metrics evaluation for startsWith in manifest filtering
306-
return std::make_shared<BoundLiteralPredicate>(
307-
Expression::Operation::kStartsWith, transform_term->reference(),
308-
std::move(literal));
309-
}
276+
// Extract width parameter using type-safe API
277+
auto width_opt = transform_term->transform()->param();
278+
if (!width_opt) {
279+
// Should never happen for truncate, but be defensive
280+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
281+
std::move(literal));
282+
}
283+
284+
int32_t truncate_width = *width_opt;
285+
286+
// Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
287+
// which is tautologically true and could accidentally broaden filters
288+
// (Note: Transform::Truncate already validates width > 0, but defensive check)
289+
if (truncate_width == 0) {
290+
return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
291+
std::move(literal));
292+
}
293+
294+
auto& string_value = std::get<std::string>(literal.value());
295+
296+
// Count UTF-8 code points (not bytes!)
297+
// Truncate uses code points: "José" has 4 code points but 5 bytes
298+
int32_t code_point_count = CountUTF8CodePoints(string_value);
299+
300+
// Only optimize if literal code point count equals truncate width
301+
// Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
302+
// truncate(col, 10) == "abc" (3 code points) CANNOT
303+
// truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
304+
if (code_point_count == truncate_width) {
305+
// Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
306+
// This benefits from strict metrics evaluation for startsWith in manifest
307+
// filtering
308+
return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
309+
transform_term->reference(),
310+
std::move(literal));
310311
}
311312
}
312313
}

src/iceberg/test/predicate_test.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
* under the License.
1818
*/
1919

20-
#include "iceberg/expression/expressions.h"
2120
#include "iceberg/expression/predicate.h"
21+
22+
#include "iceberg/expression/expressions.h"
2223
#include "iceberg/schema.h"
2324
#include "iceberg/test/matchers.h"
2425
#include "iceberg/type.h"
@@ -502,7 +503,8 @@ TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
502503
ASSERT_THAT(bound_result, IsOk());
503504
auto bound_pred = bound_result.value();
504505

505-
// Should remain as kEq, not converted to STARTS_WITH (binary doesn't support startsWith)
506+
// Should remain as kEq, not converted to STARTS_WITH (binary doesn't support
507+
// startsWith)
506508
EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
507509

508510
// The term should still be a transform

src/iceberg/transform.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
141141
/// \brief Returns the transform type.
142142
TransformType transform_type() const;
143143

144+
/// \brief Returns the optional parameter for parameterized transforms.
145+
///
146+
/// For transforms like bucket(N) or truncate(W), returns the parameter value.
147+
/// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
148+
///
149+
/// \return The parameter if present, otherwise std::nullopt
150+
std::optional<int32_t> param() const {
151+
if (auto* p = std::get_if<int32_t>(&param_)) {
152+
return *p;
153+
}
154+
return std::nullopt;
155+
}
156+
144157
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
145158
///
146159
/// This creates a concrete transform implementation based on the transform type and

0 commit comments

Comments
 (0)