|
21 | 21 |
|
22 | 22 | #include <algorithm> |
23 | 23 | #include <format> |
24 | | -#include <regex> |
25 | 24 |
|
26 | 25 | #include "iceberg/exception.h" |
27 | 26 | #include "iceberg/expression/expressions.h" |
@@ -262,51 +261,53 @@ Result<std::shared_ptr<Expression>> UnboundPredicate<B>::BindLiteralOperation( |
262 | 261 | // implies exact match, so STARTS_WITH remains valid (short-string invariance) |
263 | 262 | if (BASE::op() == Expression::Operation::kEq && |
264 | 263 | bound_term->kind() == Term::Kind::kTransform) { |
265 | | - // Use checked_cast for fail-fast debug behavior |
266 | | - auto* transform_term = |
267 | | - internal::checked_cast<BoundTransform*>(bound_term.get()); |
| 264 | + // Safe to cast after kind check confirms it's a transform |
| 265 | + auto* transform_term = dynamic_cast<BoundTransform*>(bound_term.get()); |
| 266 | + if (!transform_term) { |
| 267 | + // Should never happen after kind check, but be defensive |
| 268 | + return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), |
| 269 | + std::move(literal)); |
| 270 | + } |
268 | 271 |
|
269 | 272 | if (transform_term->transform()->transform_type() == TransformType::kTruncate && |
270 | 273 | literal.type()->type_id() == TypeId::kString && |
271 | 274 | !literal.IsNull()) { // Null safety: skip null literals |
272 | 275 |
|
273 | | - // TODO: Avoid ToString/regex parsing once Transform API exposes width directly |
274 | | - // (e.g., TruncateTransform::width() getter would be cleaner and faster) |
275 | | - // Extract width from transform string (format: "truncate[width]") |
276 | | - std::string transform_str = transform_term->transform()->ToString(); |
277 | | - |
278 | | - // Static regex to avoid recompilation on each bind (micro-optimization) |
279 | | - static const std::regex width_regex(R"(truncate\[(\d+)\])"); |
280 | | - std::smatch match; |
281 | | - |
282 | | - if (std::regex_match(transform_str, match, width_regex)) { |
283 | | - int32_t truncate_width = std::stoi(match[1].str()); |
284 | | - |
285 | | - // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("") |
286 | | - // which is tautologically true and could accidentally broaden filters |
287 | | - if (truncate_width == 0) { |
288 | | - // Don't optimize; let the normal predicate handle this edge case |
289 | | - return std::make_shared<BoundLiteralPredicate>( |
290 | | - BASE::op(), std::move(bound_term), std::move(literal)); |
291 | | - } |
292 | | - |
293 | | - auto& string_value = std::get<std::string>(literal.value()); |
294 | | - |
295 | | - // Count UTF-8 code points (not bytes!) |
296 | | - // Truncate uses code points: "José" has 4 code points but 5 bytes |
297 | | - int32_t code_point_count = CountUTF8CodePoints(string_value); |
298 | | - |
299 | | - // Only optimize if literal code point count equals truncate width |
300 | | - // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized |
301 | | - // truncate(col, 10) == "abc" (3 code points) CANNOT |
302 | | - // truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized |
303 | | - if (code_point_count == truncate_width) { |
304 | | - // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value" |
305 | | - // This benefits from strict metrics evaluation for startsWith in manifest filtering |
306 | | - return std::make_shared<BoundLiteralPredicate>( |
307 | | - Expression::Operation::kStartsWith, transform_term->reference(), |
308 | | - std::move(literal)); |
309 | | - } |
| 276 | + // Extract width parameter using type-safe API |
| 277 | + auto width_opt = transform_term->transform()->param(); |
| 278 | + if (!width_opt) { |
| 279 | + // Should never happen for truncate, but be defensive |
| 280 | + return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), |
| 281 | + std::move(literal)); |
| 282 | + } |
| 283 | + |
| 284 | + int32_t truncate_width = *width_opt; |
| 285 | + |
| 286 | + // Skip width=0: truncate(col, 0) == "" would rewrite to STARTS_WITH("") |
| 287 | + // which is tautologically true and could accidentally broaden filters |
| 288 | + // (Note: Transform::Truncate already validates width > 0, but defensive check) |
| 289 | + if (truncate_width == 0) { |
| 290 | + return std::make_shared<BoundLiteralPredicate>(BASE::op(), std::move(bound_term), |
| 291 | + std::move(literal)); |
| 292 | + } |
| 293 | + |
| 294 | + auto& string_value = std::get<std::string>(literal.value()); |
| 295 | + |
| 296 | + // Count UTF-8 code points (not bytes!) |
| 297 | + // Truncate uses code points: "José" has 4 code points but 5 bytes |
| 298 | + int32_t code_point_count = CountUTF8CodePoints(string_value); |
| 299 | + |
| 300 | + // Only optimize if literal code point count equals truncate width |
| 301 | + // Example: truncate(col, 5) == "Alice" (5 code points) can be optimized |
| 302 | + // truncate(col, 10) == "abc" (3 code points) CANNOT |
| 303 | + // truncate(col, 4) == "José" (4 code points, 5 bytes) CAN be optimized |
| 304 | + if (code_point_count == truncate_width) { |
| 305 | + // Rewrite: truncate(col, width) == "value" → col STARTS_WITH "value" |
| 306 | + // This benefits from strict metrics evaluation for startsWith in manifest |
| 307 | + // filtering |
| 308 | + return std::make_shared<BoundLiteralPredicate>(Expression::Operation::kStartsWith, |
| 309 | + transform_term->reference(), |
| 310 | + std::move(literal)); |
310 | 311 | } |
311 | 312 | } |
312 | 313 | } |
|
0 commit comments