fix: replace brittle regex parsing with type-safe Transform::param() API

shangxinli · shangxinli · commit a831452572eb · 2025-11-03T10:47:13.000-08:00
Address critical code review feedback by removing regex-based width extraction.

Changes:
- Added Transform::param() public getter returning std::optional<int32_t>
- Updated truncate optimization to use param() instead of ToString() + regex
- Removed <regex> include (no longer needed)
- More robust: immune to ToString() format changes
- Better performance: no regex compilation or string parsing
- Type-safe: compile-time checked, no runtime parsing errors

This fixes the most critical issue from code review - the brittle dependency
on Transform::ToString() string format. The new API is production-grade and
follows C++ best practices for accessing configuration parameters.
diff --git a/src/iceberg/expression/predicate.cc b/src/iceberg/expression/predicate.cc
@@ -21,7 +21,6 @@
 
 #include <algorithm>
 #include <format>
-#include <regex>
 
 #include "iceberg/exception.h"
 #include "iceberg/expression/expressions.h"
@@ -262,51 +261,53 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation(
   //   implies exact match, so STARTS_WITH remains valid (short-string invariance)
   if (BASE::op() == Expression::Operation::kEq &&
       bound_term->kind() == Term::Kind::kTransform) {
-    // Use checked_cast for fail-fast debug behavior
-    auto* transform_term =
-        internal::checked_cast<BoundTransform*>(bound_term.get());
+    // Safe to cast after kind check confirms it's a transform
+    auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get());
+    if (!transform_term) {
+      // Should never happen after kind check, but be defensive
+      return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                     std::move(literal));
+    }
 
     if (transform_term->transform()->transform_type() == TransformType::kTruncate &&
         literal.type()->type_id() == TypeId::kString &&
         !literal.IsNull()) {  // Null safety: skip null literals
 
-      // TODO: Avoid ToString/regex parsing once Transform API exposes width directly
-      // (e.g., TruncateTransform::width() getter would be cleaner and faster)
-      // Extract width from transform string (format: "truncate[width]")
-      std::string transform_str = transform_term->transform()->ToString();
-
-      // Static regex to avoid recompilation on each bind (micro-optimization)
-      static const std::regex width_regex(R"(truncate\[(\d+)\])");
-      std::smatch match;
-
-      if (std::regex_match(transform_str, match, width_regex)) {
-        int32_t truncate_width = std::stoi(match[1].str());
-
-        // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
-        // which is tautologically true and could accidentally broaden filters
-        if (truncate_width == 0) {
-          // Don't optimize; let the normal predicate handle this edge case
-          return std::make_shared<BoundLiteralPredicate>(
-              BASE::op(), std::move(bound_term), std::move(literal));
-        }
-
-        auto& string_value = std::get<std::string>(literal.value());
-
-        // Count UTF-8 code points (not bytes!)
-        // Truncate uses code points: "José" has 4 code points but 5 bytes
-        int32_t code_point_count = CountUTF8CodePoints(string_value);
-
-        // Only optimize if literal code point count equals truncate width
-        // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
-        //          truncate(col, 10) == "abc" (3 code points) CANNOT
-        //          truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
-        if (code_point_count == truncate_width) {
-          // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
-          // This benefits from strict metrics evaluation for startsWith in manifest filtering
-          return std::make_shared<BoundLiteralPredicate>(
-              Expression::Operation::kStartsWith, transform_term->reference(),
-              std::move(literal));
-        }
+      // Extract width parameter using type-safe API
+      auto width_opt = transform_term->transform()->param();
+      if (!width_opt) {
+        // Should never happen for truncate, but be defensive
+        return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                       std::move(literal));
+      }
+
+      int32_t truncate_width = *width_opt;
+
+      // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("")
+      // which is tautologically true and could accidentally broaden filters
+      // (Note: Transform::Truncate already validates width > 0, but defensive check)
+      if (truncate_width == 0) {
+        return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term),
+                                                       std::move(literal));
+      }
+
+      auto& string_value = std::get<std::string>(literal.value());
+
+      // Count UTF-8 code points (not bytes!)
+      // Truncate uses code points: "José" has 4 code points but 5 bytes
+      int32_t code_point_count = CountUTF8CodePoints(string_value);
+
+      // Only optimize if literal code point count equals truncate width
+      // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized
+      //          truncate(col, 10) == "abc" (3 code points) CANNOT
+      //          truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized
+      if (code_point_count == truncate_width) {
+        // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value"
+        // This benefits from strict metrics evaluation for startsWith in manifest
+        // filtering
+        return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith,
+                                                       transform_term->reference(),
+                                                       std::move(literal));
       }
     }
   }
diff --git a/src/iceberg/test/predicate_test.cc b/src/iceberg/test/predicate_test.cc
@@ -17,8 +17,9 @@
  * under the License.
  */
 
-#include "iceberg/expression/expressions.h"
 #include "iceberg/expression/predicate.h"
+
+#include "iceberg/expression/expressions.h"
 #include "iceberg/schema.h"
 #include "iceberg/test/matchers.h"
 #include "iceberg/type.h"
@@ -502,7 +503,8 @@ TEST_F(PredicateTest, TruncateOptimizationNotAppliedForNonString) {
   ASSERT_THAT(bound_result, IsOk());
   auto bound_pred = bound_result.value();
 
-  // Should remain as kEq, not converted to STARTS_WITH (binary doesn't support startsWith)
+  // Should remain as kEq, not converted to STARTS_WITH (binary doesn't support
+  // startsWith)
   EXPECT_EQ(bound_pred->op(), Expression::Operation::kEq);
 
   // The term should still be a transform
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
@@ -141,6 +141,19 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
   /// \brief Returns the transform type.
   TransformType transform_type() const;
 
+  /// \brief Returns the integer parameter of a parameterized transform, if any.
+  ///
+  /// For transforms like bucket(N) or truncate(W), this is N or W respectively.
+  /// For non-parameterized transforms (identity, year, etc.), returns std::nullopt.
+  /// The value is returned as stored; this getter does not validate its range.
+  /// \return The int32_t held in param_, or std::nullopt when param_ holds none.
+  std::optional<int32_t> param() const {
+    if (auto* p = std::get_if<int32_t>(&param_)) {  // null unless param_ holds int32_t
+      return *p;
+    }
+    return std::nullopt;
+  }
+
   /// \brief Binds this transform to a source type, returning a typed TransformFunction.
   ///
   /// This creates a concrete transform implementation based on the transform type and