From 288a488e55cd3a16da1a1279625c921697454652 Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 16:36:53 -0800 Subject: [PATCH 1/6] Upgrade to DataFusion 52 with Utf8View support and fix WAL metadata limits - Update delta-rs to ffb794ba to include Utf8View predicate fixes - Migrate string types to Utf8View for better performance - Fix WAL metadata size limit by using hashed topic keys (16-char hex) - Add bincode serialization for WAL entries (schema-less, compact) - Remove unnecessary session state from DML operations - Add buffer_consistency_test.rs with comprehensive buffer/Delta tests - Update test utilities and assertions for Utf8View compatibility --- Cargo.lock | 568 ++++++++++++-------------- Cargo.toml | 24 +- src/buffered_write_layer.rs | 49 ++- src/config.rs | 26 +- src/database.rs | 348 ++++++++++++---- src/dml.rs | 27 +- src/functions.rs | 50 ++- src/mem_buffer.rs | 30 +- src/object_store_cache.rs | 2 +- src/pgwire_handlers.rs | 8 +- src/schema_loader.rs | 5 +- src/test_utils.rs | 95 ++++- src/wal.rs | 150 +++++-- tests/buffer_consistency_test.rs | 319 +++++++++++++++ tests/connection_pressure_test.rs | 2 +- tests/delta_rs_api_test.rs | 16 +- tests/integration_test.rs | 2 +- tests/test_custom_functions.rs | 19 +- tests/test_dml_operations.rs | 54 +-- tests/test_postgres_json_functions.rs | 27 +- 20 files changed, 1289 insertions(+), 532 deletions(-) create mode 100644 tests/buffer_consistency_test.rs diff --git a/Cargo.lock b/Cargo.lock index a5039b6..a4602d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -341,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-pg" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43a4d328a3f45a159e9b7ee666b7f754eeec4a761a83647780d1a69dd55a1c8" +checksum = "88ce1ffbf30cd0198a53f1f838226337aa136c2eb58530253ed8796b97c05e2e" dependencies = [ "bytes", "chrono", @@ -424,19 +424,14 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "d10e4f991a553474232bc0a31799f6d24b034a84c0971d80d2e2f78b2e576e40" dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", + "compression-codecs", + "compression-core", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -458,7 +453,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -475,7 +470,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1128,7 +1123,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1197,7 +1192,7 @@ checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1222,15 +1217,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - [[package]] name = "bzip2" version = "0.6.1" @@ -1240,16 +1226,6 @@ dependencies = [ "libbz2-rs-sys", ] 
-[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cc" version = "1.2.50" @@ -1329,7 +1305,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1397,6 +1373,27 @@ dependencies = [ "unicode-width 0.2.2", ] +[[package]] +name = "compression-codecs" +version = "0.4.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00828ba6fd27b45a448e57dbfe84f1029d4c9f26b368157e9a448a5f49a2ec2a" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1699,7 +1696,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1713,7 +1710,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1735,7 +1732,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1746,7 +1743,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core 0.21.3", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1765,15 +1762,15 @@ dependencies = [ [[package]] name = "datafusion" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ba7cb113e9c0bedf9e9765926031e132fa05a1b09ba6e93a6d1a4d7044457b8" +checksum = "d12ee9fdc6cdb5898c7691bb994f0ba606c4acc93a2258d78bb9f26ff8158bb3" dependencies = [ "arrow", "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1803,27 +1800,26 @@ dependencies = [ "flate2", "futures", "itertools 0.14.0", + "liblzma", "log", "object_store", "parking_lot", "parquet", "rand 0.9.2", "regex", - "rstest", "sqlparser", "tempfile", "tokio", "url", "uuid", - "xz2", "zstd", ] [[package]] name = "datafusion-catalog" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a3a799f914a59b1ea343906a0486f17061f39509af74e874a866428951130d" +checksum = "462dc9ef45e5d688aeaae49a7e310587e81b6016b9d03bace5626ad0043e5a9e" dependencies = [ "arrow", "async-trait", @@ -1846,9 +1842,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db1b113c80d7a0febcd901476a57aef378e717c54517a163ed51417d87621b0" +checksum = "1b96dbf1d728fc321817b744eb5080cdd75312faa6980b338817f68f3caa4208" dependencies = [ "arrow", "async-trait", @@ -1865,21 +1861,20 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c10f7659e96127d25e8366be7c8be4109595d6a2c3eac70421f380a7006a1b0" +checksum = 
"3237a6ff0d2149af4631290074289cae548c9863c885d821315d54c6673a074a" dependencies = [ "ahash 0.8.12", "arrow", "arrow-ipc", "chrono", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap 2.12.1", "libc", "log", @@ -1894,9 +1889,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b92065bbc6532c6651e2f7dd30b55cba0c7a14f860c7e1d15f165c41a1868d95" +checksum = "70b5e34026af55a1bfccb1ef0a763cf1f64e77c696ffcf5a128a278c31236528" dependencies = [ "futures", "log", @@ -1905,15 +1900,15 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde13794244bc7581cd82f6fff217068ed79cdc344cafe4ab2c3a1c3510b38d6" +checksum = "1b2a6be734cc3785e18bbf2a7f2b22537f6b9fb960d79617775a51568c281842" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1928,21 +1923,21 @@ dependencies = [ "futures", "glob", "itertools 0.14.0", + "liblzma", "log", "object_store", "rand 0.9.2", "tokio", "tokio-util", "url", - "xz2", "zstd", ] [[package]] name = "datafusion-datasource-arrow" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804fa9b4ecf3157982021770617200ef7c1b2979d57bec9044748314775a9aea" +checksum = "1739b9b07c9236389e09c74f770e88aff7055250774e9def7d3f4f56b3dcc7be" dependencies = [ "arrow", "arrow-ipc", @@ -1964,9 +1959,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a1641a40b259bab38131c5e6f48fac0717bedb7dc93690e604142a849e0568" +checksum = "61c73bc54b518bbba7c7650299d07d58730293cfba4356f6f428cc94c20b7600" dependencies = [ "arrow", "async-trait", @@ -1987,9 +1982,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adeacdb00c1d37271176f8fb6a1d8ce096baba16ea7a4b2671840c5c9c64fe85" +checksum = "37812c8494c698c4d889374ecfabbff780f1f26d9ec095dd1bddfc2a8ca12559" dependencies = [ "arrow", "async-trait", @@ -2009,9 +2004,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d0b60ffd66f28bfb026565d62b0a6cbc416da09814766a3797bba7d85a3cd9" +checksum = "2210937ecd9f0e824c397e73f4b5385c97cd1aff43ab2b5836fcfd2d321523fb" dependencies = [ "arrow", "async-trait", @@ -2039,18 +2034,19 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b99e13947667b36ad713549237362afb054b2d8f8cc447751e23ec61202db07" +checksum = "2c825f969126bc2ef6a6a02d94b3c07abff871acf4d6dd759ce1255edb7923ce" [[package]] name = "datafusion-execution" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63695643190679037bc946ad46a263b62016931547bf119859c511f7ff2f5178" +checksum = "fa03ef05a2c2f90dd6c743e3e111078e322f4b395d20d4b4d431a245d79521ae" dependencies = [ "arrow", "async-trait", + "chrono", "dashmap", "datafusion-common", 
"datafusion-expr", @@ -2065,9 +2061,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a4787cbf5feb1ab351f789063398f67654a6df75c4d37d7f637dc96f951a91" +checksum = "ef33934c1f98ee695cc51192cc5f9ed3a8febee84fdbcd9131bf9d3a9a78276f" dependencies = [ "arrow", "async-trait", @@ -2088,9 +2084,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce2fb1b8c15c9ac45b0863c30b268c69dc9ee7a1ee13ecf5d067738338173dc" +checksum = "000c98206e3dd47d2939a94b6c67af4bfa6732dd668ac4fafdbde408fd9134ea" dependencies = [ "arrow", "datafusion-common", @@ -2101,9 +2097,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "794a9db7f7b96b3346fc007ff25e994f09b8f0511b4cf7dff651fadfe3ebb28f" +checksum = "379b01418ab95ca947014066248c22139fe9af9289354de10b445bd000d5d276" dependencies = [ "arrow", "arrow-buffer", @@ -2111,6 +2107,7 @@ dependencies = [ "blake2", "blake3", "chrono", + "chrono-tz", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2131,9 +2128,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c25210520a9dcf9c2b2cbbce31ebd4131ef5af7fc60ee92b266dc7d159cb305" +checksum = "fd00d5454ba4c3f8ebbd04bd6a6a9dc7ced7c56d883f70f2076c188be8459e4c" dependencies = [ "ahash 0.8.12", "arrow", @@ -2152,9 +2149,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f4a66f3b87300bb70f4124b55434d2ae3fe80455f3574701d0348da040b55d" +checksum = "aec06b380729a87210a4e11f555ec2d729a328142253f8d557b87593622ecc9f" dependencies = [ "ahash 0.8.12", "arrow", @@ -2165,9 +2162,9 @@ dependencies = [ [[package]] name = "datafusion-functions-json" -version = "0.51.1" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f427c97cd0d574a2dab3456cbe65695fed700e1136afc09d1ad7093a0ec9fb71" +checksum = "d3ce789cf93834ff0303811ce4080a5c349311fad52e3924ad26f933f59189f3" dependencies = [ "datafusion", "jiter", @@ -2177,9 +2174,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae5c06eed03918dc7fe7a9f082a284050f0e9ecf95d72f57712d1496da03b8c4" +checksum = "904f48d45e0f1eb7d0eb5c0f80f2b5c6046a85454364a6b16a2e0b46f62e7dff" dependencies = [ "arrow", "arrow-ord", @@ -2200,9 +2197,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4fed1d71738fbe22e2712d71396db04c25de4111f1ec252b8f4c6d3b25d7f5" +checksum = "e9a0d20e2b887e11bee24f7734d780a2588b925796ac741c3118dd06d5aa77f0" dependencies = [ "arrow", "async-trait", @@ -2216,9 +2213,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1d92206aa5ae21892f1552b4d61758a862a70956e6fd7a95cb85db1de74bc6d1" +checksum = "d3414b0a07e39b6979fe3a69c7aa79a9f1369f1d5c8e52146e66058be1b285ee" dependencies = [ "arrow", "datafusion-common", @@ -2234,9 +2231,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53ae9bcc39800820d53a22d758b3b8726ff84a5a3e24cecef04ef4e5fdf1c7cc" +checksum = "5bf2feae63cd4754e31add64ce75cae07d015bce4bb41cd09872f93add32523a" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2244,20 +2241,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1063ad4c9e094b3f798acee16d9a47bd7372d9699be2de21b05c3bd3f34ab848" +checksum = "c4fe888aeb6a095c4bcbe8ac1874c4b9a4c7ffa2ba849db7922683ba20875aaf" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] name = "datafusion-optimizer" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35f9ec5d08b87fd1893a30c2929f2559c2f9806ca072d8fefca5009dc0f06a" +checksum = "8a6527c063ae305c11be397a86d8193936f4b84d137fe40bd706dfc178cf733c" dependencies = [ "arrow", "chrono", @@ -2275,9 +2272,9 @@ dependencies = [ [[package]] name = "datafusion-pg-catalog" -version = "0.13.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f637c63fabff04818905edcb55f67de7072eba7420a4cffcea98b87dbce87182" +checksum = "daafc06d0478b70b13e8f3d906f2d47c49027efd3718263851137cf6d1d3e0a4" dependencies = [ "async-trait", "datafusion", @@ -2289,9 +2286,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c30cc8012e9eedcb48bbe112c6eff4ae5ed19cf3003cb0f505662e88b7014c5d" +checksum = "0bb028323dd4efd049dd8a78d78fe81b2b969447b39c51424167f973ac5811d9" dependencies = [ "ahash 0.8.12", "arrow", @@ -2301,19 +2298,21 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap 2.12.1", "itertools 0.14.0", "parking_lot", "paste", "petgraph", + "recursive", + "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9ff2dbd476221b1f67337699eff432781c4e6e1713d2aefdaa517dfbf79768" +checksum = "78fe0826aef7eab6b4b61533d811234a7a9e5e458331ebbf94152a51fc8ab433" dependencies = [ "arrow", "datafusion-common", @@ -2326,23 +2325,26 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90da43e1ec550b172f34c87ec68161986ced70fd05c8d2a2add66eef9c276f03" +checksum = "cfccd388620734c661bd8b7ca93c44cdd59fecc9b550eea416a78ffcbb29475f" dependencies = [ "ahash 0.8.12", "arrow", + "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", + "hashbrown 0.16.1", + "indexmap 2.12.1", "itertools 0.14.0", + "parking_lot", ] [[package]] name = "datafusion-physical-optimizer" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ce9804f799acd7daef3be7aaffe77c0033768ed8fdbf5fb82fc4c5f2e6bc14e6" +checksum = "bde5fa10e73259a03b705d5fddc136516814ab5f441b939525618a4070f5a059" dependencies = [ "arrow", "datafusion-common", @@ -2359,27 +2361,27 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0acf0ad6b6924c6b1aa7d213b181e012e2d3ec0a64ff5b10ee6282ab0f8532ac" +checksum = "0e1098760fb29127c24cc9ade3277051dc73c9ed0ac0131bd7bcd742e0ad7470" dependencies = [ "ahash 0.8.12", "arrow", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap 2.12.1", "itertools 0.14.0", "log", @@ -2390,9 +2392,9 @@ dependencies = [ [[package]] name = "datafusion-postgres" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b2869098db07e7b5e3e609365beba2bfef604e5f22633fbb38ddc462719cb9" +checksum = "12413f19af3af28a49fad42191b45d47941091dfeb5f58bb3791c976d3188be1" dependencies = [ "arrow-pg", "async-trait", @@ -2414,9 +2416,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d368093a98a17d1449b1083ac22ed16b7128e4c67789991869480d8c4a40ecb9" +checksum = "0cf75daf56aa6b1c6867cc33ff0fb035d517d6d06737fd355a3e1ef67cba6e7a" dependencies = [ "arrow", "chrono", @@ -2441,9 +2443,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b6aef3d5e5c1d2bc3114c4876730cb76a9bdc5a8df31ef1b6db48f0c1671895" +checksum = "12a0cb3cce232a3de0d14ef44b58a6537aeb1362cfb6cf4d808691ddbb918956" dependencies = [ "arrow", "datafusion-common", @@ -2452,9 +2454,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac2c2498a1f134a9e11a9f5ed202a2a7d7e9774bd9249295593053ea3be999db" +checksum = "64d0fef4201777b52951edec086c21a5b246f3c82621569ddb4a26f488bc38a9" dependencies = [ "arrow", "datafusion-common", @@ -2469,9 +2471,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f96eebd17555386f459037c65ab73aae8df09f464524c709d6a3134ad4f4776" +checksum = "f71f1e39e8f2acbf1c63b0e93756c2e970a64729dab70ac789587d6237c4fde0" dependencies = [ "async-trait", "datafusion-common", @@ -2483,9 +2485,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "51.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fc195fe60634b2c6ccfd131b487de46dc30eccae8a3c35a13f136e7f440414f" +checksum = "f44693cfcaeb7a9f12d71d1c576c3a6dc025a12cef209375fa2d16fb3b5670ee" dependencies = [ "arrow", "bigdecimal", @@ -2501,14 +2503,16 @@ dependencies = [ [[package]] name = "datafusion-tracing" -version = "51.0.0" -source = "git+https://github.com/datafusion-contrib/datafusion-tracing.git#2527512d7567b65c1a842f9e73543e1eeb4ef32c" 
+version = "52.0.0" +source = "git+https://github.com/datafusion-contrib/datafusion-tracing.git?rev=43734ac7a87eacb599d1d855a21c8c157d71acbb#43734ac7a87eacb599d1d855a21c8c157d71acbb" dependencies = [ + "async-trait", "comfy-table", "datafusion", "delegate", "futures", "pin-project", + "similar", "tracing", "tracing-futures", "unicode-width 0.2.2", @@ -2522,14 +2526,14 @@ checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] name = "delta_kernel" -version = "0.19.0" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1eb81d155d4f2423b931c7bf7e58a3124b23ee9a074a4771e1751b72af7fdc5" +checksum = "8d3d40b40819579c0ec4b58e8f256a8080a82f5540a42bfab9e0eb4b3f92de2a" dependencies = [ "arrow", "bytes", @@ -2564,13 +2568,13 @@ checksum = "c9e6474dabfc8e0b849ee2d68f8f13025230d1945b28c69695e9a21b9219ac8e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] name = "deltalake" -version = "0.30.0" -source = "git+https://github.com/delta-io/delta-rs.git?rev=cacb6c668f535bccfee182cd4ff3b6375b1a4e25#cacb6c668f535bccfee182cd4ff3b6375b1a4e25" +version = "0.30.1" +source = "git+https://github.com/delta-io/delta-rs.git?rev=ffb794ba0745394fc4b747a4ef2e11c2d4ec086a#ffb794ba0745394fc4b747a4ef2e11c2d4ec086a" dependencies = [ "ctor", "delta_kernel", @@ -2581,7 +2585,7 @@ dependencies = [ [[package]] name = "deltalake-aws" version = "0.13.0" -source = "git+https://github.com/delta-io/delta-rs.git?rev=cacb6c668f535bccfee182cd4ff3b6375b1a4e25#cacb6c668f535bccfee182cd4ff3b6375b1a4e25" +source = "git+https://github.com/delta-io/delta-rs.git?rev=ffb794ba0745394fc4b747a4ef2e11c2d4ec086a#ffb794ba0745394fc4b747a4ef2e11c2d4ec086a" dependencies = [ "async-trait", "aws-config", @@ -2606,8 +2610,8 @@ dependencies = [ [[package]] name = "deltalake-core" -version = "0.30.0" -source = "git+https://github.com/delta-io/delta-rs.git?rev=cacb6c668f535bccfee182cd4ff3b6375b1a4e25#cacb6c668f535bccfee182cd4ff3b6375b1a4e25" +version = "0.30.1" +source = "git+https://github.com/delta-io/delta-rs.git?rev=ffb794ba0745394fc4b747a4ef2e11c2d4ec086a#ffb794ba0745394fc4b747a4ef2e11c2d4ec086a" dependencies = [ "arrow", "arrow-arith", @@ -2626,6 +2630,7 @@ dependencies = [ "chrono", "dashmap", "datafusion", + "datafusion-datasource", "datafusion-proto", "delta_kernel", "deltalake-derive", @@ -2659,13 +2664,13 @@ dependencies = [ [[package]] name = "deltalake-derive" version = "0.30.0" -source = "git+https://github.com/delta-io/delta-rs.git?rev=cacb6c668f535bccfee182cd4ff3b6375b1a4e25#cacb6c668f535bccfee182cd4ff3b6375b1a4e25" +source = "git+https://github.com/delta-io/delta-rs.git?rev=ffb794ba0745394fc4b747a4ef2e11c2d4ec086a#ffb794ba0745394fc4b747a4ef2e11c2d4ec086a" dependencies = [ "convert_case", "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2707,7 +2712,7 @@ checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2728,7 +2733,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2738,7 +2743,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2782,7 
+2787,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2860,7 +2865,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2909,30 +2914,7 @@ checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", -] - -[[package]] -name = "env_filter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "jiff", - "log", + "syn 2.0.114", ] [[package]] @@ -3319,7 +3301,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -3334,12 +3316,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" -[[package]] -name = "futures-timer" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" - [[package]] name = "futures-util" version = "0.3.31" @@ -3404,7 +3380,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -3495,10 +3471,6 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash 0.8.12", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -3955,8 +3927,8 @@ dependencies = [ [[package]] name = "instrumented-object-store" -version = "51.0.0" -source = "git+https://github.com/datafusion-contrib/datafusion-tracing.git#2527512d7567b65c1a842f9e73543e1eeb4ef32c" +version = "52.0.0" +source = "git+https://github.com/datafusion-contrib/datafusion-tracing.git?rev=43734ac7a87eacb599d1d855a21c8c157d71acbb#43734ac7a87eacb599d1d855a21c8c157d71acbb" dependencies = [ "async-trait", "bytes", @@ -4029,30 +4001,6 @@ version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ee5b5339afb4c41626dde77b7a611bd4f2c202b897852b4bcf5d03eddc61010" -[[package]] -name = "jiff" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49cce2b81f2098e7e3efc35bc2e0a6b7abec9d34128283d7a26fa8f32a6dbb35" -dependencies = [ - "jiff-static", - "log", - "portable-atomic", - "portable-atomic-util", - "serde_core", -] - -[[package]] -name = "jiff-static" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.111", -] - [[package]] name = "jiter" version = "0.12.0" @@ -4108,7 +4056,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4189,6 +4137,26 @@ version = "0.2.178" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +[[package]] +name = "liblzma" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c36d08cad03a3fbe2c4e7bb3a9e84c57e4ee4135ed0b065cade3d98480c648" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "libm" version = "0.2.15" @@ -4322,17 +4290,6 @@ dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "madsim" version = "0.2.34" @@ -4556,7 +4513,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4908,11 +4865,22 @@ dependencies = [ "serde", ] +[[package]] +name = "pg_interval_2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a055f44628dcf9c4e68f931535dabd3544a239655fdde25a3b0e95d4b36e9260" +dependencies = [ + "bytes", + "chrono", + "postgres-types", +] + [[package]] name = "pgwire" -version = "0.36.3" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99" +checksum = "6fcd410bc6990bd8d20b3fe3cd879a3c3ec250bdb1cb12537b528818823b02c9" dependencies = [ "async-trait", "base64", @@ -4923,6 +4891,7 @@ dependencies = [ "hex", "lazy-regex", "md5", + "pg_interval_2", "postgres-types", "rand 0.9.2", "ring", @@ -4931,6 +4900,7 @@ dependencies = [ "ryu", "serde", "serde_json", + "smol_str", "stringprep", "thiserror", "tokio", @@ -4993,7 +4963,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5051,15 +5021,6 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f59e70c4aef1e55797c2e8fd94a4f2a973fc972cfde0e0b05f683667b0cd39dd" -[[package]] -name = "portable-atomic-util" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" -dependencies = [ - "portable-atomic", -] - [[package]] name = "postgres-protocol" version = "0.6.9" @@ -5145,7 +5106,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5177,7 +5138,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5257,7 +5218,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5270,7 +5231,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5444,7 +5405,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.111", + "syn 2.0.114", 
] [[package]] @@ -5493,7 +5454,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5531,12 +5492,6 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" -[[package]] -name = "relative-path" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" - [[package]] name = "rend" version = "0.4.2" @@ -5673,35 +5628,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rstest" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49" -dependencies = [ - "futures-timer", - "futures-util", - "rstest_macros", -] - -[[package]] -name = "rstest_macros" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" -dependencies = [ - "cfg-if", - "glob", - "proc-macro-crate", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.111", - "unicode-ident", -] - [[package]] name = "rust_decimal" version = "1.39.0" @@ -6026,7 +5952,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6091,7 +6017,7 @@ dependencies = [ "darling 0.21.3", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6129,7 +6055,7 @@ checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6243,6 +6169,16 @@ dependencies = [ "serde", ] +[[package]] +name = "smol_str" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f7a918bd2a9951d18ee6e48f076843e8e73a9a5d22cf05bcd4b7a81bdd04e17" +dependencies = [ + "borsh", + "serde_core", +] + [[package]] name = "snap" version = "1.1.1" @@ -6341,7 +6277,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6403,7 +6339,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6426,7 +6362,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.111", + "syn 2.0.114", "tokio", "url", ] @@ -6600,7 +6536,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6632,9 +6568,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.111" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -6658,7 +6594,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6692,6 +6628,39 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "test-case" +version = "3.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8" +dependencies = [ + "test-case-macros", +] + +[[package]] +name = "test-case-core" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "test-case-macros" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", + "test-case-core", +] + [[package]] name = "thiserror" version = "2.0.17" @@ -6709,7 +6678,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6792,7 +6761,6 @@ dependencies = [ "delta_kernel", "deltalake", "dotenv", - "env_logger", "envy", "foyer", "futures", @@ -6819,6 +6787,7 @@ dependencies = [ "strum", "tdigests", "tempfile", + "test-case", "thiserror", "tokio", "tokio-cron-scheduler", @@ -6870,9 +6839,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ "bytes", "libc", @@ -6909,7 +6878,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7139,7 +7108,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7267,7 +7236,7 @@ checksum = "076a02dc54dd46795c2e9c8282ed40bcfb1e22747e955de9389a1de28190fb26" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7415,7 +7384,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7546,7 +7515,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "wasm-bindgen-shared", ] @@ -7655,7 +7624,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7666,7 +7635,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7979,15 +7948,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yoke" version = "0.8.1" @@ -8007,7 +7967,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "synstructure", ] @@ -8034,7 +7994,7 @@ 
checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -8054,7 +8014,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "synstructure", ] @@ -8075,7 +8035,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -8108,7 +8068,7 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 9c64646..c09a4f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,8 @@ edition = "2024" [dependencies] tokio = { version = "1.48", features = ["full"] } -datafusion = "51.0.0" -datafusion-datasource = "51.0.0" +datafusion = "52.1.0" +datafusion-datasource = "52.1.0" arrow = "57.1.0" arrow-json = "57.1.0" uuid = { version = "1.17", features = ["v4", "serde"] } @@ -16,17 +16,16 @@ serde_json = "1.0.141" serde_with = "3.14" serde_yaml = "0.9" async-trait = "0.1.86" -env_logger = "0.11.6" log = "0.4.27" color-eyre = "0.6.5" arrow-schema = "57.1.0" regex = "1.11.1" -# Updated to latest delta-rs with datafusion 51 and arrow 57 support -deltalake = { git = "https://github.com/delta-io/delta-rs.git", rev = "cacb6c668f535bccfee182cd4ff3b6375b1a4e25", features = [ +# Updated to delta-rs with datafusion 52 Utf8View fixes (includes commits 987e535f, ffb794ba) +deltalake = { git = "https://github.com/delta-io/delta-rs.git", rev = "ffb794ba0745394fc4b747a4ef2e11c2d4ec086a", features = [ "datafusion", "s3", ] } -delta_kernel = { version = "0.19.0", features = [ +delta_kernel = { version = "0.19.1", features = [ "arrow-conversion", "default-engine-rustls", "arrow-57", @@ -42,8 +41,8 @@ sqlx = { version = "0.8", features = [ futures = { version = "0.3.31", features = ["alloc"] } bytes = "1.4" tokio-rustls = "0.26.1" -datafusion-postgres = "0.13.0" -datafusion-functions-json = "0.51.0" +datafusion-postgres = "0.14.0" +datafusion-functions-json = "0.52.0" anyhow = "1.0.100" tokio-util = "0.7.17" tokio-stream = { version = "0.1.17", features = ["net"] } @@ -53,8 +52,8 @@ tracing-opentelemetry = "0.32" opentelemetry = "0.31" opentelemetry-otlp = { version = "0.31", features = ["grpc-tonic"] } opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] } -datafusion-tracing = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git" } -instrumented-object-store = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git" } +datafusion-tracing = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git", rev = "43734ac7a87eacb599d1d855a21c8c157d71acbb" } +instrumented-object-store = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git", rev = "43734ac7a87eacb599d1d855a21c8c157d71acbb" } dotenv = "0.15.0" include_dir = "0.7" aws-config = { version = "1.6.0", features = ["behavior-version-latest"] } @@ -63,7 +62,7 @@ aws-sdk-s3 = "1.3.0" aws-sdk-dynamodb = "1.3.0" url = "2.5.4" tokio-cron-scheduler = "0.15" -object_store = "0.12.3" +object_store = "0.12.4" foyer = { version = "0.21.1", features = ["serde"] } ahash = "0.8" lru = "0.16.1" @@ -79,11 +78,12 @@ strum = { version = "0.27", features = ["derive"] } [dev-dependencies] sqllogictest = { git = 
"https://github.com/risinglightdb/sqllogictest-rs.git" } serial_test = "3.2.0" -datafusion-common = "51.0.0" +datafusion-common = "52.1.0" tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4"] } scopeguard = "1.2.0" rand = "0.9.2" tempfile = "3" +test-case = "3.3" [features] default = [] diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs index 46b4ce5..2b30749 100644 --- a/src/buffered_write_layer.rs +++ b/src/buffered_write_layer.rs @@ -25,6 +25,13 @@ pub struct RecoveryStats { pub corrupted_entries_skipped: u64, } +#[derive(Debug, Default)] +pub struct FlushStats { + pub buckets_flushed: u64, + pub buckets_failed: u64, + pub total_rows: u64, +} + /// Callback for writing batches to Delta Lake. The callback MUST: /// - Complete the Delta commit (including S3 upload) before returning Ok /// - Return Err if the commit fails for any reason @@ -169,6 +176,12 @@ impl BufferedWriteLayer { self.release_reservation(reserved_size); result?; + + // Immediate flush mode: flush after every insert + if self.config.buffer.flush_immediately() { + self.flush_all_now().await?; + } + debug!("BufferedWriteLayer insert complete: project={}, table={}", project_id, table_name); Ok(()) } @@ -202,7 +215,7 @@ impl BufferedWriteLayer { for entry in entries { match entry.operation { - WalOperation::Insert => match WalManager::deserialize_batch(&entry.data) { + WalOperation::Insert => match WalManager::deserialize_batch(&entry.data, &entry.table_name) { Ok(batch) => { self.mem_buffer.insert(&entry.project_id, &entry.table_name, batch, entry.timestamp_micros)?; entries_replayed += 1; @@ -332,7 +345,7 @@ impl BufferedWriteLayer { return Ok(()); } - info!("Flushing {} buckets to Delta", flushable.len()); + debug!("Flushing {} buckets to Delta", flushable.len()); // Flush buckets in parallel with bounded concurrency let parallelism = self.config.buffer.flush_parallelism(); @@ -442,6 +455,32 @@ impl BufferedWriteLayer { Ok(()) } + /// Force flush all buffered data to Delta immediately. + pub async fn flush_all_now(&self) -> anyhow::Result { + let _flush_guard = self.flush_lock.lock().await; + let all_buckets = self.mem_buffer.get_all_buckets(); + let mut stats = FlushStats { total_rows: all_buckets.iter().map(|b| b.row_count as u64).sum(), ..Default::default() }; + + for bucket in all_buckets { + match self.flush_bucket(&bucket).await { + Ok(()) => { + self.checkpoint_and_drain(&bucket); + stats.buckets_flushed += 1; + } + Err(e) => { + error!("flush_all_now: failed bucket {}: {}", bucket.bucket_id, e); + stats.buckets_failed += 1; + } + } + } + Ok(stats) + } + + /// Check if buffer is empty (all data flushed). 
+ pub fn is_empty(&self) -> bool { + self.mem_buffer.get_stats().total_rows == 0 + } + pub fn get_stats(&self) -> MemBufferStats { self.mem_buffer.get_stats() } @@ -503,7 +542,7 @@ impl BufferedWriteLayer { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int64Array, StringArray}; + use arrow::array::{Int64Array, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema}; use std::path::PathBuf; use tempfile::tempdir; @@ -517,10 +556,10 @@ mod tests { fn create_test_batch() -> RecordBatch { let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int64, false), - Field::new("name", DataType::Utf8, false), + Field::new("name", DataType::Utf8View, false), ])); let id_array = Int64Array::from(vec![1, 2, 3]); - let name_array = StringArray::from(vec!["a", "b", "c"]); + let name_array = StringViewArray::from(vec!["a", "b", "c"]); RecordBatch::try_new(schema, vec![Arc::new(id_array), Arc::new(name_array)]).unwrap() } diff --git a/src/config.rs b/src/config.rs index f98f090..2149f40 100644 --- a/src/config.rs +++ b/src/config.rs @@ -198,18 +198,19 @@ impl AwsConfig { } let mut opts = HashMap::new(); - insert_opt!(opts, "aws_access_key_id", self.aws_access_key_id); - insert_opt!(opts, "aws_secret_access_key", self.aws_secret_access_key); - insert_opt!(opts, "aws_region", self.aws_default_region); - opts.insert("aws_endpoint".into(), endpoint_override.unwrap_or(&self.aws_s3_endpoint).to_string()); + insert_opt!(opts, "AWS_ACCESS_KEY_ID", self.aws_access_key_id); + insert_opt!(opts, "AWS_SECRET_ACCESS_KEY", self.aws_secret_access_key); + insert_opt!(opts, "AWS_REGION", self.aws_default_region); + insert_opt!(opts, "AWS_ALLOW_HTTP", self.aws_allow_http); + opts.insert("AWS_ENDPOINT_URL".into(), endpoint_override.unwrap_or(&self.aws_s3_endpoint).to_string()); if self.is_dynamodb_locking_enabled() { - opts.insert("aws_s3_locking_provider".into(), "dynamodb".into()); - insert_opt!(opts, "delta_dynamo_table_name", self.dynamodb.delta_dynamo_table_name); - insert_opt!(opts, "aws_access_key_id_dynamodb", self.dynamodb.aws_access_key_id_dynamodb); - insert_opt!(opts, "aws_secret_access_key_dynamodb", self.dynamodb.aws_secret_access_key_dynamodb); - insert_opt!(opts, "aws_region_dynamodb", self.dynamodb.aws_region_dynamodb); - insert_opt!(opts, "aws_endpoint_url_dynamodb", self.dynamodb.aws_endpoint_url_dynamodb); + opts.insert("AWS_S3_LOCKING_PROVIDER".into(), "dynamodb".into()); + insert_opt!(opts, "DELTA_DYNAMO_TABLE_NAME", self.dynamodb.delta_dynamo_table_name); + insert_opt!(opts, "AWS_ACCESS_KEY_ID_DYNAMODB", self.dynamodb.aws_access_key_id_dynamodb); + insert_opt!(opts, "AWS_SECRET_ACCESS_KEY_DYNAMODB", self.dynamodb.aws_secret_access_key_dynamodb); + insert_opt!(opts, "AWS_REGION_DYNAMODB", self.dynamodb.aws_region_dynamodb); + insert_opt!(opts, "AWS_ENDPOINT_URL_DYNAMODB", self.dynamodb.aws_endpoint_url_dynamodb); } opts } @@ -247,6 +248,8 @@ pub struct BufferConfig { pub timefusion_wal_corruption_threshold: usize, #[serde(default = "d_flush_parallelism")] pub timefusion_flush_parallelism: usize, + #[serde(default)] + pub timefusion_flush_immediately: bool, } impl BufferConfig { @@ -268,6 +271,9 @@ impl BufferConfig { pub fn flush_parallelism(&self) -> usize { self.timefusion_flush_parallelism.max(1) } + pub fn flush_immediately(&self) -> bool { + self.timefusion_flush_immediately + } pub fn compute_shutdown_timeout(&self, current_memory_mb: usize) -> Duration { Duration::from_secs((self.timefusion_shutdown_timeout_secs.max(1) + (current_memory_mb / 100) as u64).min(300)) 
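A minimal sketch of how the pieces added above (the `timefusion_flush_immediately` config flag, `flush_all_now`, and `is_empty`) fit together. It assumes an already-constructed `BufferedWriteLayer` named `layer` and that envy maps the new field to a `TIMEFUSION_FLUSH_IMMEDIATELY` environment variable; `drain_buffer` is a hypothetical helper, not code from this patch.

// Hedged sketch: exercises the new flush controls; names outside the patch are assumptions.
async fn drain_buffer(layer: &BufferedWriteLayer) -> anyhow::Result<()> {
    // flush_all_now() walks every buffered bucket, flushing each to Delta and
    // checkpointing the WAL on success; failures are counted rather than aborting the pass.
    let stats = layer.flush_all_now().await?;
    log::info!(
        "flushed {} buckets ({} rows), {} failed",
        stats.buckets_flushed, stats.total_rows, stats.buckets_failed
    );

    // With TIMEFUSION_FLUSH_IMMEDIATELY=true (assumed envy mapping of the new
    // timefusion_flush_immediately field), insert() performs this flush itself, so
    // the buffer should read as empty whenever no bucket failed.
    if stats.buckets_failed == 0 {
        assert!(layer.is_empty());
    }
    Ok(())
}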
diff --git a/src/database.rs b/src/database.rs
index 39c644a..4539028 100644
--- a/src/database.rs
+++ b/src/database.rs
@@ -6,14 +6,15 @@ use anyhow::Result;
 use arrow_schema::SchemaRef;
 use async_trait::async_trait;
 use chrono::Utc;
-use datafusion::arrow::array::{Array, AsArray};
+use datafusion::arrow::array::Array;
+use datafusion::physical_expr::expressions::{CastExpr, Column as PhysicalColumn};
+use datafusion::physical_plan::projection::ProjectionExec;
 use datafusion::common::not_impl_err;
 use datafusion::common::{SchemaExt, Statistics};
 use datafusion::datasource::sink::{DataSink, DataSinkExec};
 use datafusion::execution::TaskContext;
 use datafusion::execution::context::SessionContext;
 use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown};
-// Removed unused imports
 use datafusion::physical_plan::DisplayAs;
 use datafusion::scalar::ScalarValue;
 use datafusion::{
@@ -56,10 +57,18 @@ pub async fn get_delta_table(project_configs: &ProjectConfigs, project_id: &str,
 // Helper function to extract project_id from a batch
 pub fn extract_project_id(batch: &RecordBatch) -> Option<String> {
+    use datafusion::arrow::array::{StringArray, StringViewArray};
+
     batch.schema().fields().iter().position(|f| f.name() == "project_id").and_then(|idx| {
         let column = batch.column(idx);
-        let string_array = column.as_string::<i32>();
-        (string_array.len() > 0 && !string_array.is_null(0)).then(|| string_array.value(0).to_string())
+        // Try Utf8View first (our preferred type), then fall back to Utf8
+        if let Some(arr) = column.as_any().downcast_ref::<StringViewArray>() {
+            (arr.len() > 0 && !arr.is_null(0)).then(|| arr.value(0).to_string())
+        } else if let Some(arr) = column.as_any().downcast_ref::<StringArray>() {
+            (arr.len() > 0 && !arr.is_null(0)).then(|| arr.value(0).to_string())
+        } else {
+            None
+        }
     })
 }

@@ -382,6 +391,17 @@ impl Database {
         self.buffered_layer.as_ref()
     }

+    /// Query Delta tables directly, bypassing the in-memory buffer (for testing).
+    pub async fn query_delta_only(&self, sql: &str) -> Result<Vec<RecordBatch>> {
+        let mut db_clone = self.clone();
+        db_clone.buffered_layer = None;
+        let db_arc = Arc::new(db_clone);
+        let mut ctx = Arc::clone(&db_arc).create_session_context();
+        datafusion_functions_json::register_all(&mut ctx)?;
+        db_arc.setup_session_context(&mut ctx)?;
+        Ok(ctx.sql(sql).await?.collect().await?)
+ } + /// Enable object store cache with foyer (deprecated - cache is now initialized in new()) /// This method is kept for backward compatibility but is now a no-op pub async fn with_object_store_cache(self) -> Result { @@ -560,6 +580,10 @@ impl Database { let mut options = ConfigOptions::new(); let _ = options.set("datafusion.catalog.information_schema", "true"); + // Ensure Utf8View handling for consistent string types across DataFusion and Delta + let _ = options.set("datafusion.execution.parquet.schema_force_view_types", "true"); + let _ = options.set("datafusion.sql_parser.map_string_types_to_utf8view", "true"); + // Enable Parquet statistics for better query optimization with Delta Lake // These settings ensure DataFusion uses file and column statistics for pruning let _ = options.set("datafusion.execution.parquet.statistics_enabled", "page"); @@ -688,44 +712,45 @@ impl Database { /// Register PostgreSQL settings table for compatibility pub fn register_pg_settings_table(&self, ctx: &SessionContext) -> datafusion::error::Result<()> { - use datafusion::arrow::array::StringArray; + use datafusion::arrow::array::StringViewArray; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; let schema = Arc::new(Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("setting", DataType::Utf8, false), + Field::new("name", DataType::Utf8View, false), + Field::new("setting", DataType::Utf8View, false), ])); - let names = vec![ - "TimeZone".to_string(), - "client_encoding".to_string(), - "datestyle".to_string(), - "client_min_messages".to_string(), - // Add more PostgreSQL settings that clients might try to set - "lc_monetary".to_string(), - "lc_numeric".to_string(), - "lc_time".to_string(), - "standard_conforming_strings".to_string(), - "application_name".to_string(), - "search_path".to_string(), + let names: Vec<&str> = vec![ + "TimeZone", + "client_encoding", + "datestyle", + "client_min_messages", + "lc_monetary", + "lc_numeric", + "lc_time", + "standard_conforming_strings", + "application_name", + "search_path", ]; - let settings = vec![ - "UTC".to_string(), - "UTF8".to_string(), - "ISO, MDY".to_string(), - "notice".to_string(), - // Default values for the additional settings - "C".to_string(), - "C".to_string(), - "C".to_string(), - "on".to_string(), - "TimeFusion".to_string(), - "public".to_string(), + let settings: Vec<&str> = vec![ + "UTC", + "UTF8", + "ISO, MDY", + "notice", + "C", + "C", + "C", + "on", + "TimeFusion", + "public", ]; - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(StringArray::from(names)), Arc::new(StringArray::from(settings))])?; + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(StringViewArray::from(names)), Arc::new(StringViewArray::from(settings))], + )?; ctx.register_batch("pg_settings", batch)?; Ok(()) @@ -733,17 +758,17 @@ impl Database { /// Register set_config UDF for PostgreSQL compatibility pub fn register_set_config_udf(&self, ctx: &SessionContext) { - use datafusion::arrow::array::{StringArray, StringBuilder}; + use datafusion::arrow::array::{StringViewArray, StringViewBuilder}; use datafusion::arrow::datatypes::DataType; use datafusion::logical_expr::{ColumnarValue, ScalarFunctionImplementation, Volatility, create_udf}; let set_config_fn: ScalarFunctionImplementation = Arc::new(move |args: &[ColumnarValue]| -> datafusion::error::Result { let param_value_array = match &args[1] { - ColumnarValue::Array(array) => 
array.as_any().downcast_ref::().expect("set_config second arg must be a StringArray"), + ColumnarValue::Array(array) => array.as_any().downcast_ref::().expect("set_config second arg must be a StringViewArray"), _ => panic!("set_config second arg must be an array"), }; - let mut builder = StringBuilder::new(); + let mut builder = StringViewBuilder::new(); for i in 0..param_value_array.len() { if param_value_array.is_null(i) { builder.append_null(); @@ -756,8 +781,8 @@ impl Database { let set_config_udf = create_udf( "set_config", - vec![DataType::Utf8, DataType::Utf8, DataType::Boolean], - DataType::Utf8, + vec![DataType::Utf8View, DataType::Utf8View, DataType::Boolean], + DataType::Utf8View, Volatility::Volatile, set_config_fn, ); @@ -891,30 +916,30 @@ impl Database { ); let mut storage_options = HashMap::new(); - storage_options.insert("aws_access_key_id".to_string(), config.s3_access_key_id.clone()); - storage_options.insert("aws_secret_access_key".to_string(), config.s3_secret_access_key.clone()); - storage_options.insert("aws_region".to_string(), config.s3_region.clone()); + storage_options.insert("AWS_ACCESS_KEY_ID".to_string(), config.s3_access_key_id.clone()); + storage_options.insert("AWS_SECRET_ACCESS_KEY".to_string(), config.s3_secret_access_key.clone()); + storage_options.insert("AWS_REGION".to_string(), config.s3_region.clone()); if let Some(ref endpoint) = config.s3_endpoint { - storage_options.insert("aws_endpoint".to_string(), endpoint.clone()); + storage_options.insert("AWS_ENDPOINT_URL".to_string(), endpoint.clone()); } // Add DynamoDB locking configuration if enabled (even for project-specific configs) if self.config.aws.is_dynamodb_locking_enabled() { - storage_options.insert("aws_s3_locking_provider".to_string(), "dynamodb".to_string()); + storage_options.insert("AWS_S3_LOCKING_PROVIDER".to_string(), "dynamodb".to_string()); if let Some(ref table) = self.config.aws.dynamodb.delta_dynamo_table_name { - storage_options.insert("delta_dynamo_table_name".to_string(), table.clone()); + storage_options.insert("DELTA_DYNAMO_TABLE_NAME".to_string(), table.clone()); } if let Some(ref key) = self.config.aws.dynamodb.aws_access_key_id_dynamodb { - storage_options.insert("aws_access_key_id_dynamodb".to_string(), key.clone()); + storage_options.insert("AWS_ACCESS_KEY_ID_DYNAMODB".to_string(), key.clone()); } if let Some(ref secret) = self.config.aws.dynamodb.aws_secret_access_key_dynamodb { - storage_options.insert("aws_secret_access_key_dynamodb".to_string(), secret.clone()); + storage_options.insert("AWS_SECRET_ACCESS_KEY_DYNAMODB".to_string(), secret.clone()); } if let Some(ref region) = self.config.aws.dynamodb.aws_region_dynamodb { - storage_options.insert("aws_region_dynamodb".to_string(), region.clone()); + storage_options.insert("AWS_REGION_DYNAMODB".to_string(), region.clone()); } if let Some(ref endpoint) = self.config.aws.dynamodb.aws_endpoint_url_dynamodb { - storage_options.insert("aws_endpoint_url_dynamodb".to_string(), endpoint.clone()); + storage_options.insert("AWS_ENDPOINT_URL_DYNAMODB".to_string(), endpoint.clone()); } } @@ -1062,16 +1087,16 @@ impl Database { let mut builder = AmazonS3Builder::new().with_bucket_name(bucket); // Apply storage options - if let Some(access_key) = storage_options.get("aws_access_key_id") { + if let Some(access_key) = storage_options.get("AWS_ACCESS_KEY_ID") { builder = builder.with_access_key_id(access_key); } - if let Some(secret_key) = storage_options.get("aws_secret_access_key") { + if let Some(secret_key) = 
storage_options.get("AWS_SECRET_ACCESS_KEY") { builder = builder.with_secret_access_key(secret_key); } - if let Some(region) = storage_options.get("aws_region") { + if let Some(region) = storage_options.get("AWS_REGION") { builder = builder.with_region(region); } - if let Some(endpoint) = storage_options.get("aws_endpoint") { + if let Some(endpoint) = storage_options.get("AWS_ENDPOINT_URL") { builder = builder.with_endpoint(endpoint); // If endpoint is HTTP, allow HTTP connections if endpoint.starts_with("http://") { @@ -1080,24 +1105,24 @@ impl Database { } // Use config values as fallback - if storage_options.get("aws_access_key_id").is_none() + if storage_options.get("AWS_ACCESS_KEY_ID").is_none() && let Some(ref key) = self.config.aws.aws_access_key_id { builder = builder.with_access_key_id(key); } - if storage_options.get("aws_secret_access_key").is_none() + if storage_options.get("AWS_SECRET_ACCESS_KEY").is_none() && let Some(ref secret) = self.config.aws.aws_secret_access_key { builder = builder.with_secret_access_key(secret); } - if storage_options.get("aws_region").is_none() + if storage_options.get("AWS_REGION").is_none() && let Some(ref region) = self.config.aws.aws_default_region { builder = builder.with_region(region); } // Check if we need to use config for endpoint and allow HTTP - if storage_options.get("aws_endpoint").is_none() { + if storage_options.get("AWS_ENDPOINT_URL").is_none() { let endpoint = &self.config.aws.aws_s3_endpoint; builder = builder.with_endpoint(endpoint); if endpoint.starts_with("http://") { @@ -1108,8 +1133,8 @@ impl Database { let store = builder.build()?; // Log if DynamoDB locking is enabled for this store - if storage_options.get("aws_s3_locking_provider") == Some(&"dynamodb".to_string()) - && let Some(table_name) = storage_options.get("delta_dynamo_table_name") + if storage_options.get("AWS_S3_LOCKING_PROVIDER") == Some(&"dynamodb".to_string()) + && let Some(table_name) = storage_options.get("DELTA_DYNAMO_TABLE_NAME") { debug!("Object store configured with DynamoDB locking using table: {}", table_name); } @@ -1118,11 +1143,17 @@ impl Database { } /// Creates or loads a DeltaTable with proper configuration - /// When DynamoDB locking is enabled, we have to use the standard DeltaTableBuilder - /// without custom storage backend to ensure proper log store initialization + /// Sets environment variables from storage_options to ensure delta-rs credential resolution works async fn create_or_load_delta_table( &self, storage_uri: &str, storage_options: HashMap, cached_store: Arc, ) -> Result { + // Set env vars from storage_options for delta-rs credential resolution + for (key, value) in &storage_options { + if key.starts_with("AWS_") { + unsafe { std::env::set_var(key, value); } + } + } + DeltaTableBuilder::from_url(Url::parse(storage_uri)?)? .with_storage_backend(cached_store.clone(), Url::parse(storage_uri)?) 
.with_storage_options(storage_options.clone()) @@ -1536,15 +1567,25 @@ impl ProjectRoutingTable { fn extract_project_id(&self, expr: &Expr) -> Option { match expr { Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::Eq => { - if let (Expr::Column(col), Expr::Literal(ScalarValue::Utf8(Some(value)), None)) = (left.as_ref(), right.as_ref()) + // Check column = value (both Utf8 and Utf8View) + if let Expr::Column(col) = left.as_ref() && col.name == "project_id" { - return Some(value.clone()); + match right.as_ref() { + Expr::Literal(ScalarValue::Utf8(Some(v)), _) => return Some(v.clone()), + Expr::Literal(ScalarValue::Utf8View(Some(v)), _) => return Some(v.clone()), + _ => {} + } } - if let (Expr::Literal(ScalarValue::Utf8(Some(value)), None), Expr::Column(col)) = (left.as_ref(), right.as_ref()) + // Check value = column (both Utf8 and Utf8View) + if let Expr::Column(col) = right.as_ref() && col.name == "project_id" { - return Some(value.clone()); + match left.as_ref() { + Expr::Literal(ScalarValue::Utf8(Some(v)), _) => return Some(v.clone()), + Expr::Literal(ScalarValue::Utf8View(Some(v)), _) => return Some(v.clone()), + _ => {} + } } None } @@ -1669,7 +1710,76 @@ impl ProjectRoutingTable { ) -> DFResult> { let delta_table = self.database.resolve_table(project_id, &self.table_name).await?; let table = delta_table.read().await; - table.scan(state, projection.cloned().as_ref(), filters, limit).await + + // Register the object store with DataFusion's runtime so table_provider().scan() can access it + let log_store = table.log_store(); + let root_store = log_store.root_object_store(None); + let bucket_url = { + let table_url = table.table_url(); + let scheme = table_url.scheme(); + let bucket = table_url.host_str().unwrap_or(""); + Url::parse(&format!("{}://{}/", scheme, bucket)).expect("valid bucket URL") + }; + state.runtime_env().register_object_store(&bucket_url, root_store); + + let provider = table.table_provider().await.map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Translate projection indices from our schema to delta table's schema + // The projection indices from DataFusion are based on ProjectRoutingTable.schema, + // but the delta table provider expects indices based on its own schema + let delta_schema = provider.schema(); + let translated_projection = projection.map(|proj| { + proj.iter() + .filter_map(|&idx| { + // Get column name from our schema + let col_name = self.schema.field(idx).name(); + // Find column index in delta schema + delta_schema.fields().iter().position(|f| f.name() == col_name) + }) + .collect::>() + }); + + let delta_plan = provider.scan(state, translated_projection.as_ref(), filters, limit).await?; + + // Determine target schema based on projection + let target_schema = if let Some(proj) = projection { + Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())) + } else { + self.schema.clone() + }; + + // Coerce delta output schema to match our expected schema (e.g., Utf8 -> Utf8View) + let delta_output_schema = delta_plan.schema(); + if delta_output_schema.fields().len() == target_schema.fields().len() { + let needs_coercion = delta_output_schema + .fields() + .iter() + .zip(target_schema.fields()) + .any(|(delta_field, target_field)| delta_field.data_type() != target_field.data_type()); + + if needs_coercion { + // Create cast expressions for each column + let cast_exprs: Vec<(Arc, String)> = delta_output_schema + .fields() + .iter() + .enumerate() + .zip(target_schema.fields()) + 
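// Reviewer sketch (not part of the patch): the predicate probe that extract_project_id
// above amounts to, restated with its type parameters spelled out. It accepts both Utf8
// and Utf8View literals, since DataFusion 52 may produce either depending on the
// map_string_types_to_utf8view setting.
use datafusion::common::ScalarValue;
use datafusion::logical_expr::{BinaryExpr, Expr, Operator};

fn project_id_from_eq(expr: &Expr) -> Option<String> {
    let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr else { return None };
    if *op != Operator::Eq {
        return None;
    }
    // Accept `project_id = 'x'` as well as `'x' = project_id`.
    for (col_side, lit_side) in [(left, right), (right, left)] {
        if let Expr::Column(col) = col_side.as_ref() {
            if col.name == "project_id" {
                match lit_side.as_ref() {
                    Expr::Literal(ScalarValue::Utf8(Some(v)), _) => return Some(v.clone()),
                    Expr::Literal(ScalarValue::Utf8View(Some(v)), _) => return Some(v.clone()),
                    _ => {}
                }
            }
        }
    }
    None
}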
.map(|((idx, delta_field), target_field)| { + let col_expr = Arc::new(PhysicalColumn::new(delta_field.name(), idx)) as Arc; + let expr: Arc = if delta_field.data_type() != target_field.data_type() { + Arc::new(CastExpr::new(col_expr, target_field.data_type().clone(), None)) + } else { + col_expr + }; + (expr, target_field.name().clone()) + }) + .collect(); + + return Ok(Arc::new(ProjectionExec::try_new(cast_exprs, delta_plan)?)); + } + } + + Ok(delta_plan) } /// Extract time range (min, max) from query filters. @@ -1945,13 +2055,79 @@ impl TableProvider for ProjectRoutingTable { let delta_table = self.database.resolve_table(&project_id, &self.table_name).instrument(resolve_span).await?; let table = delta_table.read().await; + // Register the object store with DataFusion's runtime so table_provider().scan() can access it + let log_store = table.log_store(); + let root_store = log_store.root_object_store(None); + let bucket_url = { + let table_url = table.table_url(); + let scheme = table_url.scheme(); + let bucket = table_url.host_str().unwrap_or(""); + Url::parse(&format!("{}://{}/", scheme, bucket)).expect("valid bucket URL") + }; + state.runtime_env().register_object_store(&bucket_url, root_store); + let scan_span = tracing::trace_span!("delta_table.scan", table.name = %self.table_name, table.project_id = %project_id, partition_filters = ?delta_filters.iter().filter(|f| matches!(f, Expr::BinaryExpr(_))).count() ); - let delta_plan = table.scan(state, projection.cloned().as_ref(), &delta_filters, limit).instrument(scan_span).await?; + let provider = table.table_provider().await.map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Translate projection indices from our schema to delta table's schema + let delta_schema = provider.schema(); + let translated_projection = projection.map(|proj| { + proj.iter() + .filter_map(|&idx| { + let col_name = self.schema.field(idx).name(); + delta_schema.fields().iter().position(|f| f.name() == col_name) + }) + .collect::>() + }); + + let delta_plan = provider.scan(state, translated_projection.as_ref(), &delta_filters, limit).instrument(scan_span).await?; + + // Determine target schema based on projection + let target_schema = if let Some(proj) = projection { + Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())) + } else { + self.schema.clone() + }; + + // Coerce delta output schema to match our expected schema (e.g., Utf8 -> Utf8View) + let delta_output_schema = delta_plan.schema(); + let delta_plan = if delta_output_schema.fields().len() == target_schema.fields().len() { + let needs_coercion = delta_output_schema + .fields() + .iter() + .zip(target_schema.fields()) + .any(|(delta_field, target_field)| delta_field.data_type() != target_field.data_type()); + + if needs_coercion { + // Create cast expressions for each column + let cast_exprs: Vec<(Arc, String)> = delta_output_schema + .fields() + .iter() + .enumerate() + .zip(target_schema.fields()) + .map(|((idx, delta_field), target_field)| { + let col_expr = Arc::new(PhysicalColumn::new(delta_field.name(), idx)) as Arc; + let expr: Arc = if delta_field.data_type() != target_field.data_type() { + Arc::new(CastExpr::new(col_expr, target_field.data_type().clone(), None)) + } else { + col_expr + }; + (expr, target_field.name().clone()) + }) + .collect(); + + Arc::new(ProjectionExec::try_new(cast_exprs, delta_plan)?) 
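// Reviewer sketch (not part of the patch): the name-based projection translation described
// in the comments above, as a standalone helper over two arrow schemas. Columns that do not
// exist in the target schema are silently dropped, matching the filter_map in the patch.
use datafusion::arrow::datatypes::Schema;

fn translate_projection(ours: &Schema, theirs: &Schema, projection: &[usize]) -> Vec<usize> {
    projection
        .iter()
        .filter_map(|&idx| {
            let name = ours.field(idx).name();
            theirs.fields().iter().position(|f| f.name() == name)
        })
        .collect()
}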
as Arc + } else { + delta_plan + } + } else { + delta_plan + }; // Union both plans (mem data first for recency, then Delta for historical) UnionExec::try_new(vec![mem_plan, delta_plan]) @@ -1980,6 +2156,20 @@ mod tests { use serial_test::serial; use std::path::PathBuf; + /// Helper function to extract string value from array column, handling different string array types + fn get_str(array: &dyn Array, idx: usize) -> String { + use datafusion::arrow::array::{StringArray, LargeStringArray, StringViewArray}; + if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else { + panic!("Unsupported string array type: {:?}", array.data_type()) + } + } + fn create_test_config(test_id: &str) -> Arc { let mut cfg = AppConfig::default(); // S3/MinIO settings @@ -2036,8 +2226,8 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), "test1"); - assert_eq!(result[0].column(1).as_string::().value(0), "span1"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "test1"); + assert_eq!(get_str(result[0].column(1).as_ref(), 0), "span1"); // Shutdown database db.shutdown().await?; @@ -2067,7 +2257,7 @@ mod tests { let sql = format!("SELECT id FROM otel_logs_and_spans WHERE project_id = '{}'", project); let result = ctx.sql(&sql).await?.collect().await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), format!("id_{}", project)); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), format!("id_{}", project)); } // Verify total count - need to check across all projects @@ -2096,7 +2286,6 @@ mod tests { let (db, ctx, prefix) = setup_test_database().await?; let project_id = format!("filter_proj_{}", prefix); use chrono::Utc; - use datafusion::arrow::array::AsArray; use serde_json::json; let now = Utc::now(); @@ -2141,7 +2330,7 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), "span2"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "span2"); // Test filtering by duration let result = ctx @@ -2153,7 +2342,7 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), "span2"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "span2"); // Test compound filtering let result = ctx @@ -2165,7 +2354,7 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(1).as_string::().value(0), "Error occurred"); + assert_eq!(get_str(result[0].column(1).as_ref(), 0), "Error occurred"); // Shutdown database to ensure proper cleanup db.shutdown().await?; @@ -2222,7 +2411,7 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(1).as_string::().value(0), "sql_name"); + assert_eq!(get_str(result[0].column(1).as_ref(), 0), "sql_name"); db.shutdown().await?; Ok(()) @@ -2262,9 +2451,9 @@ mod tests { // Verify individual records let result = ctx.sql(&format!("SELECT id, name FROM otel_logs_and_spans WHERE project_id = '{}' ORDER BY id", project_id)).await?.collect().await?; assert_eq!(result[0].num_rows(), 3); - assert_eq!(result[0].column(0).as_string::().value(0), "id1"); - assert_eq!(result[0].column(0).as_string::().value(1), "id2"); - 
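// Reviewer sketch (not part of the patch): the test helper added above, restated with its
// type parameters spelled out. After the Utf8View migration a string column may surface as
// StringArray, LargeStringArray, or StringViewArray depending on where it came from (the
// memory buffer, a Delta scan, or a cast), so the assertions go through one accessor.
use datafusion::arrow::array::{Array, LargeStringArray, StringArray, StringViewArray};

fn get_str(array: &dyn Array, idx: usize) -> String {
    if let Some(a) = array.as_any().downcast_ref::<StringViewArray>() {
        a.value(idx).to_string()
    } else if let Some(a) = array.as_any().downcast_ref::<StringArray>() {
        a.value(idx).to_string()
    } else if let Some(a) = array.as_any().downcast_ref::<LargeStringArray>() {
        a.value(idx).to_string()
    } else {
        panic!("unsupported string array type: {:?}", array.data_type())
    }
}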
assert_eq!(result[0].column(0).as_string::().value(2), "id3"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "id1"); + assert_eq!(get_str(result[0].column(0).as_ref(), 1), "id2"); + assert_eq!(get_str(result[0].column(0).as_ref(), 2), "id3"); // Shutdown database db.shutdown().await?; @@ -2282,7 +2471,6 @@ mod tests { let (db, ctx, prefix) = setup_test_database().await?; let project_id = format!("ts_test_{}", prefix); use chrono::Utc; - use datafusion::arrow::array::AsArray; use serde_json::json; let base_time = chrono::DateTime::parse_from_rfc3339("2023-01-01T10:00:00Z").unwrap().with_timezone(&Utc); @@ -2329,7 +2517,7 @@ mod tests { .await?; assert!(!result.is_empty(), "Query returned no results"); assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), "late"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "late"); // Test timestamp formatting - need to include project_id let result = ctx @@ -2341,8 +2529,8 @@ mod tests { .collect() .await?; assert_eq!(result[0].num_rows(), 2); - assert_eq!(result[0].column(1).as_string::().value(0), "2023-01-01 10:00"); - assert_eq!(result[0].column(1).as_string::().value(1), "2023-01-01 12:00"); + assert_eq!(get_str(result[0].column(1).as_ref(), 0), "2023-01-01 10:00"); + assert_eq!(get_str(result[0].column(1).as_ref(), 1), "2023-01-01 12:00"); // Shutdown database to ensure proper cleanup db.shutdown().await?; diff --git a/src/dml.rs b/src/dml.rs index dc0c756..1395c26 100644 --- a/src/dml.rs +++ b/src/dml.rs @@ -9,10 +9,7 @@ use datafusion::{ }, common::{Column, Result}, error::DataFusionError, - execution::{ - SendableRecordBatchStream, TaskContext, - context::{QueryPlanner, SessionState}, - }, + execution::{SendableRecordBatchStream, TaskContext, context::{QueryPlanner, SessionState}}, logical_expr::{BinaryExpr, Expr, LogicalPlan, Operator, WriteOp}, physical_plan::{DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, PlanProperties, stream::RecordBatchStreamAdapter}, physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, @@ -443,6 +440,7 @@ pub async fn perform_delta_update( let span = tracing::Span::current(); let result = perform_delta_operation(database, table_name, project_id, |delta_table| async move { + // delta-rs handles Utf8View automatically with schema_force_view_types=true (default in DF52+) let mut builder = delta_table.update(); if let Some(pred) = predicate { @@ -483,6 +481,7 @@ pub async fn perform_delta_delete(database: &Database, table_name: &str, project let span = tracing::Span::current(); let result = perform_delta_operation(database, table_name, project_id, |delta_table| async move { + // delta-rs handles Utf8View automatically with schema_force_view_types=true (default in DF52+) let mut builder = delta_table.delete(); if let Some(pred) = predicate { @@ -527,15 +526,15 @@ where Ok(rows_affected) } -/// Convert DataFusion Expr to Delta-compatible format +/// Convert DataFusion Expr to Delta-compatible format. +/// Only strips table qualifiers from columns - Utf8View is kept for consistency. 
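// Reviewer sketch (not part of the patch): what the rewritten convert_expr_to_delta below
// amounts to. TreeNode::transform walks the whole expression, so table qualifiers are
// stripped from columns nested anywhere in the predicate, not just at the top level as in
// the old hand-written recursion. Names in the demo are illustrative.
use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::common::{Column, Result};
use datafusion::logical_expr::Expr;
use datafusion::prelude::{col, lit};

fn strip_qualifiers(expr: Expr) -> Result<Expr> {
    expr.transform(|e| match &e {
        Expr::Column(c) => Ok(Transformed::yes(Expr::Column(Column::from_name(&c.name)))),
        _ => Ok(Transformed::no(e)),
    })
    .map(|t| t.data)
}

fn demo() -> Result<()> {
    // `otel_logs_and_spans.project_id = 'p1'` becomes `project_id = 'p1'`.
    let qualified = col("otel_logs_and_spans.project_id").eq(lit("p1"));
    let _unqualified = strip_qualifiers(qualified)?;
    Ok(())
}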
fn convert_expr_to_delta(expr: &Expr) -> Result { - match expr { - Expr::Column(col) => Ok(Expr::Column(Column::from_name(&col.name))), - Expr::BinaryExpr(binary) => Ok(Expr::BinaryExpr(BinaryExpr { - left: Box::new(convert_expr_to_delta(&binary.left)?), - op: binary.op, - right: Box::new(convert_expr_to_delta(&binary.right)?), - })), - _ => Ok(expr.clone()), - } + use datafusion::common::tree_node::TreeNode; + expr.clone() + .transform(|e| match &e { + Expr::Column(col) => Ok(datafusion::common::tree_node::Transformed::yes(Expr::Column(Column::from_name(&col.name)))), + _ => Ok(datafusion::common::tree_node::Transformed::no(e)), + }) + .map(|t| t.data) + .map_err(|e| DataFusionError::Execution(format!("Failed to convert expression: {}", e))) } diff --git a/src/functions.rs b/src/functions.rs index da9782d..c12a239 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -2,8 +2,8 @@ use anyhow::Result; use chrono::{DateTime, Utc}; use chrono_tz::Tz; use datafusion::arrow::array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Float64Array, Int64Array, ListArray, StringArray, StringBuilder, TimestampMicrosecondArray, - TimestampNanosecondArray, + Array, ArrayRef, BinaryArray, BooleanArray, Float64Array, Int64Array, ListArray, StringArray, StringViewArray, StringViewBuilder, + TimestampMicrosecondArray, TimestampNanosecondArray, }; use datafusion::arrow::datatypes::{DataType, TimeUnit}; use datafusion::common::{DataFusionError, ScalarValue, not_impl_err}; @@ -80,7 +80,7 @@ impl ScalarUDFImpl for ToCharUDF { } fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { - Ok(DataType::Utf8) + Ok(DataType::Utf8View) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> datafusion::error::Result { @@ -101,11 +101,18 @@ impl ScalarUDFImpl for ToCharUDF { let format_str = match &args[1] { ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(Some(s)) => s.clone(), + ScalarValue::Utf8View(Some(s)) => s.clone(), ScalarValue::LargeUtf8(Some(s)) => s.clone(), _ => return Err(DataFusionError::Execution("Format string must be a UTF8 string".to_string())), }, ColumnarValue::Array(arr) => { - if let Some(str_arr) = arr.as_any().downcast_ref::() { + if let Some(str_arr) = arr.as_any().downcast_ref::() { + if str_arr.len() == 1 && !str_arr.is_null(0) { + str_arr.value(0).to_string() + } else { + return Err(DataFusionError::Execution("Format string must be a scalar value".to_string())); + } + } else if let Some(str_arr) = arr.as_any().downcast_ref::() { if str_arr.len() == 1 && !str_arr.is_null(0) { str_arr.value(0).to_string() } else { @@ -125,7 +132,7 @@ impl ScalarUDFImpl for ToCharUDF { /// Format timestamps according to PostgreSQL format patterns fn format_timestamps(timestamp_array: &ArrayRef, format_str: &str) -> datafusion::error::Result { let chrono_format = postgres_to_chrono_format(format_str); - let mut builder = StringBuilder::new(); + let mut builder = StringViewBuilder::new(); let format_fn = |timestamp_us: i64| -> datafusion::error::Result { DateTime::::from_timestamp_micros(timestamp_us) @@ -237,11 +244,18 @@ impl ScalarUDFImpl for AtTimeZoneUDF { let tz_str = match &args[1] { ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(Some(s)) => s.clone(), + ScalarValue::Utf8View(Some(s)) => s.clone(), ScalarValue::LargeUtf8(Some(s)) => s.clone(), _ => return Err(DataFusionError::Execution("Timezone must be a UTF8 string".to_string())), }, ColumnarValue::Array(arr) => { - if let Some(str_arr) = arr.as_any().downcast_ref::() { + if let Some(str_arr) = 
arr.as_any().downcast_ref::() { + if str_arr.len() == 1 && !str_arr.is_null(0) { + str_arr.value(0).to_string() + } else { + return Err(DataFusionError::Execution("Timezone must be a scalar string value".to_string())); + } + } else if let Some(str_arr) = arr.as_any().downcast_ref::() { if str_arr.len() == 1 && !str_arr.is_null(0) { str_arr.value(0).to_string() } else { @@ -332,8 +346,8 @@ fn create_jsonb_array_elements_udf() -> ScalarUDF { create_udf( "jsonb_array_elements", - vec![DataType::Utf8], - DataType::Utf8, + vec![DataType::Utf8View], + DataType::Utf8View, Volatility::Immutable, jsonb_array_elements_fn, ) @@ -371,14 +385,14 @@ impl ScalarUDFImpl for JsonBuildArrayUDF { } fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { - Ok(DataType::Utf8) + Ok(DataType::Utf8View) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> datafusion::error::Result { let args = args.args; if args.is_empty() { // Empty array case - let mut builder = StringBuilder::with_capacity(1, 1024); + let mut builder = StringViewBuilder::with_capacity(1); builder.append_value("[]"); return Ok(ColumnarValue::Array(Arc::new(builder.finish()))); } @@ -389,7 +403,7 @@ impl ScalarUDFImpl for JsonBuildArrayUDF { ColumnarValue::Scalar(_) => 1, }; - let mut builder = StringBuilder::with_capacity(num_rows, 1024); + let mut builder = StringViewBuilder::with_capacity(num_rows); for row_idx in 0..num_rows { let mut row_values = Vec::new(); @@ -449,7 +463,7 @@ impl ScalarUDFImpl for ToJsonUDF { } fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { - Ok(DataType::Utf8) + Ok(DataType::Utf8View) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> datafusion::error::Result { @@ -464,7 +478,7 @@ impl ScalarUDFImpl for ToJsonUDF { }; let json_values = array_to_json_values(&array)?; - let mut builder = StringBuilder::with_capacity(json_values.len(), 1024); + let mut builder = StringViewBuilder::with_capacity(json_values.len()); for value in json_values { builder.append_value(value.to_string()); @@ -557,11 +571,11 @@ fn array_to_json_values(array: &ArrayRef) -> datafusion::error::Result { + DataType::Utf8View => { let string_array = array .as_any() - .downcast_ref::() - .ok_or_else(|| DataFusionError::Execution("Failed to downcast to StringArray".to_string()))?; + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("Failed to downcast to StringViewArray".to_string()))?; for i in 0..string_array.len() { if string_array.is_null(i) { values.push(JsonValue::Null); @@ -650,7 +664,7 @@ fn array_to_json_values(array: &ArrayRef) -> datafusion::error::Result { // For other types, try to convert to string - let string_array = datafusion::arrow::compute::cast(array, &DataType::Utf8)?; + let string_array = datafusion::arrow::compute::cast(array, &DataType::Utf8View)?; return array_to_json_values(&string_array); } } @@ -695,7 +709,7 @@ fn create_time_bucket_udf() -> ScalarUDF { create_udf( "time_bucket", - vec![DataType::Utf8, DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC")))], + vec![DataType::Utf8View, DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC")))], DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC"))), Volatility::Immutable, time_bucket_fn, diff --git a/src/mem_buffer.rs b/src/mem_buffer.rs index 48d08ea..dd2b51c 100644 --- a/src/mem_buffer.rs +++ b/src/mem_buffer.rs @@ -12,7 +12,7 @@ use datafusion::sql::sqlparser::dialect::GenericDialect; use datafusion::sql::sqlparser::parser::Parser as SqlParser; use 
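// Reviewer sketch (not part of the patch): the cast fallback used just above (and again in
// mem_buffer and test_utils). Utf8 to Utf8View casts keep the two string representations
// interchangeable at crate boundaries; the function name here is illustrative.
use std::sync::Arc;
use datafusion::arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
use datafusion::arrow::compute::cast;
use datafusion::arrow::datatypes::DataType;

fn utf8_to_view() -> datafusion::error::Result<()> {
    let classic: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), None, Some("c")]));
    let viewed = cast(&classic, &DataType::Utf8View)?;
    assert_eq!(viewed.data_type(), &DataType::Utf8View);
    let viewed = viewed.as_any().downcast_ref::<StringViewArray>().unwrap();
    assert_eq!(viewed.value(0), "a");
    assert!(viewed.is_null(1));
    Ok(())
}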
std::sync::atomic::{AtomicI64, AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; -use tracing::{debug, info, instrument, warn}; +use tracing::{debug, instrument, warn}; // 10-minute buckets balance flush granularity vs overhead. Shorter = more flushes, // longer = larger Delta files. Matches default flush interval for aligned boundaries. @@ -46,7 +46,7 @@ fn schemas_compatible(existing: &SchemaRef, incoming: &SchemaRef) -> bool { } } if new_fields > 0 { - info!("Schema evolution: {} new nullable field(s) added", new_fields); + debug!("Schema evolution: {} new nullable field(s) added", new_fields); } true } @@ -155,7 +155,13 @@ pub fn estimate_batch_size(batch: &RecordBatch) -> usize { /// Merge two arrays based on a boolean mask. /// For each row: if mask[i] is true, use new_values[i], else use original[i]. fn merge_arrays(original: &ArrayRef, new_values: &ArrayRef, mask: &BooleanArray) -> DFResult { - arrow::compute::kernels::zip::zip(mask, new_values, original).map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) + // Cast new_values to match original's type if they differ (e.g., Utf8 -> Utf8View) + let new_values = if original.data_type() != new_values.data_type() { + arrow::compute::cast(new_values, original.data_type()).map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None))? + } else { + new_values.clone() + }; + arrow::compute::kernels::zip::zip(mask, &new_values, original).map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) } /// Parse a SQL WHERE clause fragment into a DataFusion Expr. @@ -423,7 +429,7 @@ impl MemBuffer { pub fn get_flushable_buckets(&self, cutoff_bucket_id: i64) -> Vec { let flushable = self.collect_buckets(|bucket_id| bucket_id < cutoff_bucket_id); - info!("MemBuffer flushable buckets: count={}, cutoff={}", flushable.len(), cutoff_bucket_id); + debug!("MemBuffer flushable buckets: count={}, cutoff={}", flushable.len(), cutoff_bucket_id); flushable } @@ -478,7 +484,7 @@ impl MemBuffer { } if evicted_count > 0 { - info!( + debug!( "MemBuffer evicted {} buckets older than bucket_id={}, freed {} bytes", evicted_count, cutoff_bucket_id, freed_bytes ); @@ -697,7 +703,7 @@ impl MemBuffer { pub fn clear(&self) { self.tables.clear(); self.estimated_bytes.store(0, Ordering::Relaxed); - info!("MemBuffer cleared"); + debug!("MemBuffer cleared"); } } @@ -767,7 +773,7 @@ impl TimeBucket { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int64Array, StringArray, TimestampMicrosecondArray}; + use arrow::array::{Int64Array, StringViewArray, TimestampMicrosecondArray}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use std::sync::Arc; @@ -775,11 +781,11 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("timestamp", DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), false), Field::new("id", DataType::Int64, false), - Field::new("name", DataType::Utf8, false), + Field::new("name", DataType::Utf8View, false), ])); let ts_array = TimestampMicrosecondArray::from(vec![timestamp_micros]).with_timezone("UTC"); let id_array = Int64Array::from(vec![1]); - let name_array = StringArray::from(vec!["test"]); + let name_array = StringViewArray::from(vec!["test"]); RecordBatch::try_new(schema, vec![Arc::new(ts_array), Arc::new(id_array), Arc::new(name_array)]).unwrap() } @@ -851,11 +857,11 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("timestamp", DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), false), Field::new("id", 
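// Reviewer sketch (not part of the patch): what merge_arrays above does for an in-buffer
// UPDATE, shown on concrete arrays. The zip kernel picks the new value where the mask is
// true and keeps the original row otherwise; the cast added by the patch only runs when the
// two arrays disagree on string representation.
use datafusion::arrow::array::{Array, BooleanArray, StringViewArray};
use datafusion::arrow::compute::kernels::zip::zip;

fn masked_update() -> datafusion::error::Result<()> {
    let original = StringViewArray::from(vec!["a", "b", "c"]);
    let updated = StringViewArray::from(vec!["a", "updated", "c"]);
    let mask = BooleanArray::from(vec![false, true, false]);
    let merged = zip(&mask, &updated, &original)?;
    let merged = merged.as_any().downcast_ref::<StringViewArray>().unwrap();
    assert_eq!(merged.value(1), "updated");
    Ok(())
}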
DataType::Int64, false), - Field::new("name", DataType::Utf8, false), + Field::new("name", DataType::Utf8View, false), ])); let ts_array = TimestampMicrosecondArray::from(vec![ts; ids.len()]).with_timezone("UTC"); let id_array = Int64Array::from(ids); - let name_array = StringArray::from(names); + let name_array = StringViewArray::from(names); RecordBatch::try_new(schema, vec![Arc::new(ts_array), Arc::new(id_array), Arc::new(name_array)]).unwrap() } @@ -917,7 +923,7 @@ mod tests { let batch = &results[0]; assert_eq!(batch.num_rows(), 3); - let name_col = batch.column(2).as_any().downcast_ref::().unwrap(); + let name_col = batch.column(2).as_any().downcast_ref::().unwrap(); assert_eq!(name_col.value(0), "a"); assert_eq!(name_col.value(1), "updated"); assert_eq!(name_col.value(2), "c"); diff --git a/src/object_store_cache.rs b/src/object_store_cache.rs index 48cabf5..67df578 100644 --- a/src/object_store_cache.rs +++ b/src/object_store_cache.rs @@ -668,7 +668,7 @@ impl ObjectStore for FoyerObjectStoreCache { if value.is_expired(ttl) { self.update_stats(|s| s.ttl_expirations += 1).await; self.cache.remove(&cache_key); - info!( + debug!( "Foyer cache EXPIRED for: {} (TTL: {}s, age: {}ms)", location, ttl.as_secs(), diff --git a/src/pgwire_handlers.rs b/src/pgwire_handlers.rs index a85950d..fd7526d 100644 --- a/src/pgwire_handlers.rs +++ b/src/pgwire_handlers.rs @@ -71,9 +71,9 @@ pub struct LoggingSimpleQueryHandler { } impl LoggingSimpleQueryHandler { - pub fn new(session_context: Arc, auth_manager: Arc) -> Self { + pub fn new(session_context: Arc, _auth_manager: Arc) -> Self { Self { - inner: DfSessionService::new(session_context, auth_manager), + inner: DfSessionService::new(session_context), } } } @@ -144,9 +144,9 @@ pub struct LoggingExtendedQueryHandler { } impl LoggingExtendedQueryHandler { - pub fn new(session_context: Arc, auth_manager: Arc) -> Self { + pub fn new(session_context: Arc, _auth_manager: Arc) -> Self { Self { - inner: DfSessionService::new(session_context, auth_manager), + inner: DfSessionService::new(session_context), } } } diff --git a/src/schema_loader.rs b/src/schema_loader.rs index cc360e2..088bd14 100644 --- a/src/schema_loader.rs +++ b/src/schema_loader.rs @@ -94,13 +94,14 @@ impl TableSchema { fn parse_arrow_data_type(s: &str) -> anyhow::Result { Ok(match s { - "Utf8" => ArrowDataType::Utf8, + // Use Utf8View for better performance with zero-copy string operations + "Utf8" => ArrowDataType::Utf8View, "Date32" => ArrowDataType::Date32, "Int32" => ArrowDataType::Int32, "Int64" => ArrowDataType::Int64, "UInt32" => ArrowDataType::UInt32, "UInt64" => ArrowDataType::UInt64, - "List(Utf8)" => ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Utf8, true))), + "List(Utf8)" => ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Utf8View, true))), "Timestamp(Microsecond, None)" => ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Microsecond, None), "Timestamp(Microsecond, Some(\"UTC\"))" => ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Microsecond, Some("UTC".into())), _ => anyhow::bail!("Unknown type: {}", s), diff --git a/src/test_utils.rs b/src/test_utils.rs index fe7dbae..3c00e46 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -1,19 +1,106 @@ +/// Initialize tracing for tests. Call at start of test functions. +/// Uses try_init() so multiple calls are safe. 
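// Reviewer sketch (not part of the patch): the type-string mapping applied by
// parse_arrow_data_type above, reduced to the string cases. Declared "Utf8" columns are
// routed through Utf8View so schemas loaded from configuration agree with the session-level
// view-type settings; the helper name is illustrative.
use std::sync::Arc;
use datafusion::arrow::datatypes::{DataType, Field};

fn parse_string_type(s: &str) -> Option<DataType> {
    match s {
        "Utf8" => Some(DataType::Utf8View),
        "List(Utf8)" => Some(DataType::List(Arc::new(Field::new("item", DataType::Utf8View, true)))),
        _ => None,
    }
}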
+pub fn init_test_logging() { + use tracing_subscriber::EnvFilter; + let _ = tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env().add_directive("info".parse().unwrap())) + .with_test_writer() + .try_init(); +} + pub mod test_helpers { + use crate::config::AppConfig; use crate::schema_loader::get_default_schema; use arrow_json::ReaderBuilder; + use datafusion::arrow::compute::cast; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; use serde_json::{Value, json}; use std::collections::HashMap; + use std::path::PathBuf; + use std::sync::Arc; + + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + pub enum BufferMode { + Enabled, + FlushImmediately, + } + + pub struct TestConfigBuilder { + test_name: String, + buffer_mode: BufferMode, + } + + impl TestConfigBuilder { + pub fn new(test_name: &str) -> Self { + Self { test_name: test_name.to_string(), buffer_mode: BufferMode::Enabled } + } + + pub fn with_buffer_mode(mut self, mode: BufferMode) -> Self { + self.buffer_mode = mode; + self + } + + pub fn build(self) -> Arc { + let uuid = uuid::Uuid::new_v4().to_string()[..8].to_string(); + let mut cfg = AppConfig::default(); + cfg.aws.aws_s3_bucket = Some("timefusion-tests".to_string()); + cfg.aws.aws_access_key_id = Some("minioadmin".to_string()); + cfg.aws.aws_secret_access_key = Some("minioadmin".to_string()); + cfg.aws.aws_s3_endpoint = "http://127.0.0.1:9000".to_string(); + cfg.aws.aws_default_region = Some("us-east-1".to_string()); + cfg.aws.aws_allow_http = Some("true".to_string()); + cfg.core.timefusion_table_prefix = format!("test-{}-{}", self.test_name, uuid); + cfg.core.walrus_data_dir = PathBuf::from(format!("/tmp/walrus-{}-{}", self.test_name, uuid)); + cfg.cache.timefusion_foyer_disabled = true; + cfg.buffer.timefusion_flush_immediately = self.buffer_mode == BufferMode::FlushImmediately; + Arc::new(cfg) + } + } pub fn json_to_batch(records: Vec) -> anyhow::Result { - let schema = get_default_schema().schema_ref(); + let target_schema = get_default_schema().schema_ref(); + + // Create a schema for reading JSON with Utf8 (which arrow-json produces) + let json_read_schema = Arc::new(Schema::new( + target_schema + .fields() + .iter() + .map(|f| { + let data_type = match f.data_type() { + DataType::Utf8View => DataType::Utf8, + DataType::List(inner) if inner.data_type() == &DataType::Utf8View => { + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + } + other => other.clone(), + }; + Field::new(f.name(), data_type, f.is_nullable()) + }) + .collect::>(), + )); + let json_data = records.into_iter().map(|v| v.to_string()).collect::>().join("\n"); - ReaderBuilder::new(schema.clone()) + let batch = ReaderBuilder::new(json_read_schema) .build(std::io::Cursor::new(json_data.as_bytes()))? .next() - .ok_or_else(|| anyhow::anyhow!("Failed to read batch"))? - .map_err(Into::into) + .ok_or_else(|| anyhow::anyhow!("Failed to read batch"))??; + + // Cast columns to target schema types (Utf8 -> Utf8View) + let columns: Vec> = batch + .columns() + .iter() + .zip(target_schema.fields()) + .map(|(col, field)| { + if col.data_type() != field.data_type() { + cast(col, field.data_type()).unwrap_or_else(|_| col.clone()) + } else { + col.clone() + } + }) + .collect(); + + Ok(RecordBatch::try_new(target_schema, columns)?) 
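// Reviewer sketch (not part of the patch): the two-step JSON decode used by json_to_batch
// above, on a one-column schema. arrow-json produces Utf8, so the batch is read against a
// Utf8 schema and each column is then cast up to Utf8View to match the table schema. The
// function and field names are illustrative.
use std::io::Cursor;
use std::sync::Arc;
use arrow_json::ReaderBuilder;
use datafusion::arrow::array::ArrayRef;
use datafusion::arrow::compute::cast;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;

fn json_line_to_view_batch(line: &str) -> anyhow::Result<RecordBatch> {
    let target = Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8View, true)]));
    let read_schema = Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, true)]));
    let batch = ReaderBuilder::new(read_schema)
        .build(Cursor::new(line.as_bytes()))?
        .next()
        .ok_or_else(|| anyhow::anyhow!("empty input"))??;
    let columns: Vec<ArrayRef> = batch
        .columns()
        .iter()
        .zip(target.fields())
        .map(|(col, field)| cast(col, field.data_type()))
        .collect::<Result<_, _>>()?;
    Ok(RecordBatch::try_new(target, columns)?)
}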
} pub fn create_default_record() -> HashMap { diff --git a/src/wal.rs b/src/wal.rs index 48696a8..fc57993 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -1,9 +1,9 @@ -use arrow::array::RecordBatch; -use arrow::ipc::reader::StreamReader; -use arrow::ipc::writer::StreamWriter; +use crate::schema_loader::{get_default_schema, get_schema}; +use arrow::array::{Array, ArrayRef, RecordBatch, make_array}; +use arrow::buffer::{Buffer, NullBuffer}; +use arrow::datatypes::{DataType, SchemaRef}; use bincode::{Decode, Encode}; use dashmap::DashSet; -use std::io::Cursor; use std::path::PathBuf; use thiserror::Error; use tracing::{debug, error, info, instrument, warn}; @@ -84,6 +84,80 @@ pub struct UpdatePayload { pub assignments: Vec<(String, String)>, } +/// Compact representation of a column's raw Arrow buffers (no schema embedded) +#[derive(Debug, Encode, Decode)] +struct CompactColumn { + null_bitmap: Option>, + buffers: Vec>, + children: Vec, + null_count: usize, + /// Length of child arrays (needed for List types where child length != parent length) + child_lens: Vec, +} + +/// Compact batch without schema - just raw column data +#[derive(Debug, Encode, Decode)] +struct CompactBatch { + num_rows: usize, + columns: Vec, +} + +impl CompactColumn { + fn from_array(array: &dyn Array) -> Self { + let data = array.to_data(); + Self { + null_bitmap: data.nulls().map(|n| n.buffer().as_slice().to_vec()), + buffers: data.buffers().iter().map(|b| b.as_slice().to_vec()).collect(), + children: data.child_data().iter().map(|c| Self::from_array_data(c)).collect(), + null_count: data.null_count(), + child_lens: data.child_data().iter().map(|c| c.len()).collect(), + } + } + + fn from_array_data(data: &arrow::array::ArrayData) -> Self { + Self { + null_bitmap: data.nulls().map(|n| n.buffer().as_slice().to_vec()), + buffers: data.buffers().iter().map(|b| b.as_slice().to_vec()).collect(), + children: data.child_data().iter().map(|c| Self::from_array_data(c)).collect(), + null_count: data.null_count(), + child_lens: data.child_data().iter().map(|c| c.len()).collect(), + } + } + + fn to_array_data(&self, data_type: &DataType, len: usize) -> arrow::array::ArrayData { + let null_buffer = self.null_bitmap.as_ref().map(|b| { + NullBuffer::new(arrow::buffer::BooleanBuffer::new(Buffer::from(b.as_slice()), 0, len)) + }); + let buffers: Vec = self.buffers.iter().map(|b| Buffer::from(b.as_slice())).collect(); + + let child_data: Vec = match data_type { + DataType::List(field) => { + self.children.iter().zip(&self.child_lens) + .map(|(child, &child_len)| child.to_array_data(field.data_type(), child_len)) + .collect() + } + DataType::Struct(fields) => { + self.children.iter().zip(fields.iter()).zip(&self.child_lens) + .map(|((child, field), &child_len)| child.to_array_data(field.data_type(), child_len)) + .collect() + } + _ => vec![], + }; + + unsafe { + arrow::array::ArrayData::new_unchecked( + data_type.clone(), + len, + Some(self.null_count), + null_buffer.map(|n| n.into_inner().into_inner()), + 0, + buffers, + child_data, + ) + } + } +} + pub struct WalManager { wal: Walrus, data_dir: PathBuf, @@ -126,10 +200,20 @@ impl WalManager { } } + /// Human-readable topic identifier for metadata/logging fn make_topic(project_id: &str, table_name: &str) -> String { format!("{}:{}", project_id, table_name) } + /// Short hash for walrus topic key (walrus has 62-byte metadata limit) + fn walrus_topic_key(project_id: &str, table_name: &str) -> String { + use std::hash::{Hash, Hasher}; + let mut hasher = 
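// Reviewer sketch (not part of the patch): the fixed-width topic key derived here. Hashing
// the project/table pair down to 16 hex characters keeps the walrus key well under its
// 62-byte metadata limit no matter how long project ids get, while the readable
// "project:table" topic string is kept separately for logging. Note that DefaultHasher's
// algorithm is not guaranteed stable across Rust releases, so keys are only meaningful for
// data written by the same binary generation.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn hashed_topic_key(project_id: &str, table_name: &str) -> String {
    let mut hasher = DefaultHasher::new();
    project_id.hash(&mut hasher);
    table_name.hash(&mut hasher);
    format!("{:016x}", hasher.finish())
}

fn demo() {
    let key = hashed_topic_key("a-very-long-project-identifier", "otel_logs_and_spans");
    assert_eq!(key.len(), 16);
}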
std::collections::hash_map::DefaultHasher::new(); + project_id.hash(&mut hasher); + table_name.hash(&mut hasher); + format!("{:016x}", hasher.finish()) + } + fn parse_topic(topic: &str) -> Option<(String, String)> { topic.split_once(':').map(|(p, t)| (p.to_string(), t.to_string())) } @@ -137,8 +221,9 @@ impl WalManager { #[instrument(skip(self, batch), fields(project_id, table_name, rows))] pub fn append(&self, project_id: &str, table_name: &str, batch: &RecordBatch) -> Result<(), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let entry = WalEntry::new(project_id, table_name, WalOperation::Insert, serialize_record_batch(batch)?); - self.wal.append_for_topic(&topic, &serialize_wal_entry(&entry)?)?; + self.wal.append_for_topic(&walrus_key, &serialize_wal_entry(&entry)?)?; self.persist_topic(&topic); debug!("WAL append INSERT: topic={}, rows={}", topic, batch.num_rows()); Ok(()) @@ -147,13 +232,14 @@ impl WalManager { #[instrument(skip(self, batches), fields(project_id, table_name, batch_count))] pub fn append_batch(&self, project_id: &str, table_name: &str, batches: &[RecordBatch]) -> Result<(), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let payloads: Vec> = batches .iter() .map(|batch| serialize_wal_entry(&WalEntry::new(project_id, table_name, WalOperation::Insert, serialize_record_batch(batch)?))) .collect::>()?; let payload_refs: Vec<&[u8]> = payloads.iter().map(Vec::as_slice).collect(); - self.wal.batch_append_for_topic(&topic, &payload_refs)?; + self.wal.batch_append_for_topic(&walrus_key, &payload_refs)?; self.persist_topic(&topic); debug!("WAL batch append INSERT: topic={}, batches={}", topic, batches.len()); Ok(()) @@ -162,6 +248,7 @@ impl WalManager { #[instrument(skip(self), fields(project_id, table_name))] pub fn append_delete(&self, project_id: &str, table_name: &str, predicate_sql: Option<&str>) -> Result<(), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let data = bincode::encode_to_vec( &DeletePayload { predicate_sql: predicate_sql.map(String::from), @@ -169,7 +256,7 @@ impl WalManager { BINCODE_CONFIG, )?; let entry = WalEntry::new(project_id, table_name, WalOperation::Delete, data); - self.wal.append_for_topic(&topic, &serialize_wal_entry(&entry)?)?; + self.wal.append_for_topic(&walrus_key, &serialize_wal_entry(&entry)?)?; self.persist_topic(&topic); debug!("WAL append DELETE: topic={}, predicate={:?}", topic, predicate_sql); Ok(()) @@ -178,12 +265,13 @@ impl WalManager { #[instrument(skip(self, assignments), fields(project_id, table_name))] pub fn append_update(&self, project_id: &str, table_name: &str, predicate_sql: Option<&str>, assignments: &[(String, String)]) -> Result<(), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let payload = UpdatePayload { predicate_sql: predicate_sql.map(String::from), assignments: assignments.to_vec(), }; let entry = WalEntry::new(project_id, table_name, WalOperation::Update, bincode::encode_to_vec(&payload, BINCODE_CONFIG)?); - self.wal.append_for_topic(&topic, &serialize_wal_entry(&entry)?)?; + self.wal.append_for_topic(&walrus_key, &serialize_wal_entry(&entry)?)?; self.persist_topic(&topic); debug!( "WAL append UPDATE: topic={}, predicate={:?}, assignments={}", @@ -199,12 +287,13 @@ impl 
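// Reviewer sketch (not part of the patch): the bincode 2 round trip behind the WAL entry and
// DML payload serialization used here, on a stand-in payload type. The real code uses the
// crate's BINCODE_CONFIG constant, whose definition is not shown in this hunk; standard() is
// used below as an assumption.
use bincode::{Decode, Encode};

#[derive(Debug, PartialEq, Encode, Decode)]
struct DemoPayload {
    predicate_sql: Option<String>,
    assignments: Vec<(String, String)>,
}

fn round_trip() -> Result<(), bincode::error::EncodeError> {
    let config = bincode::config::standard();
    let payload = DemoPayload {
        predicate_sql: Some("project_id = 'p1'".to_string()),
        assignments: vec![("duration".to_string(), "999".to_string())],
    };
    let bytes = bincode::encode_to_vec(&payload, config)?;
    let (decoded, _len): (DemoPayload, usize) =
        bincode::decode_from_slice(&bytes, config).expect("decode");
    assert_eq!(decoded, payload);
    Ok(())
}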
WalManager { &self, project_id: &str, table_name: &str, since_timestamp_micros: Option, checkpoint: bool, ) -> Result<(Vec, usize), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let cutoff = since_timestamp_micros.unwrap_or(0); let mut results = Vec::new(); let mut error_count = 0usize; loop { - match self.wal.read_next(&topic, checkpoint) { + match self.wal.read_next(&walrus_key, checkpoint) { Ok(Some(entry_data)) => match deserialize_wal_entry(&entry_data.data) { Ok(entry) if entry.timestamp_micros >= cutoff => results.push(entry), Ok(_) => {} // Skip old entries @@ -261,8 +350,11 @@ impl WalManager { Ok((all_results, total_errors)) } - pub fn deserialize_batch(data: &[u8]) -> Result { - deserialize_record_batch(data) + pub fn deserialize_batch(data: &[u8], table_name: &str) -> Result { + let schema = get_schema(table_name) + .map(|s| s.schema_ref()) + .unwrap_or_else(|| get_default_schema().schema_ref()); + deserialize_record_batch(data, &schema) } pub fn list_topics(&self) -> Result, WalError> { @@ -272,9 +364,10 @@ impl WalManager { #[instrument(skip(self))] pub fn checkpoint(&self, project_id: &str, table_name: &str) -> Result<(), WalError> { let topic = Self::make_topic(project_id, table_name); + let walrus_key = Self::walrus_topic_key(project_id, table_name); let mut count = 0; loop { - match self.wal.read_next(&topic, true) { + match self.wal.read_next(&walrus_key, true) { Ok(Some(_)) => count += 1, Ok(None) => break, Err(e) => { @@ -295,15 +388,25 @@ impl WalManager { } fn serialize_record_batch(batch: &RecordBatch) -> Result, WalError> { - let mut buffer = Vec::new(); - let mut writer = StreamWriter::try_new(&mut buffer, &batch.schema())?; - writer.write(batch)?; - writer.finish()?; - Ok(buffer) + let compact = CompactBatch { + num_rows: batch.num_rows(), + columns: batch.columns().iter().map(|c| CompactColumn::from_array(c.as_ref())).collect(), + }; + bincode::encode_to_vec(&compact, BINCODE_CONFIG).map_err(WalError::BincodeEncode) } -fn deserialize_record_batch(data: &[u8]) -> Result { - StreamReader::try_new(Cursor::new(data), None)?.next().ok_or(WalError::EmptyBatch)?.map_err(WalError::ArrowIpc) +fn deserialize_record_batch(data: &[u8], schema: &SchemaRef) -> Result { + let (compact, _): (CompactBatch, _) = bincode::decode_from_slice(data, BINCODE_CONFIG)?; + + let arrays: Vec = compact.columns.iter() + .zip(schema.fields()) + .map(|(col, field)| { + let array_data = col.to_array_data(field.data_type(), compact.num_rows); + make_array(array_data) + }) + .collect(); + + RecordBatch::try_new(schema.clone(), arrays).map_err(WalError::ArrowIpc) } fn serialize_wal_entry(entry: &WalEntry) -> Result, WalError> { @@ -344,18 +447,18 @@ pub fn deserialize_update_payload(data: &[u8]) -> Result RecordBatch { let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int64, false), - Field::new("name", DataType::Utf8, false), + Field::new("name", DataType::Utf8View, false), ])); RecordBatch::try_new( schema, - vec![Arc::new(Int64Array::from(vec![1, 2, 3])), Arc::new(StringArray::from(vec!["a", "b", "c"]))], + vec![Arc::new(Int64Array::from(vec![1, 2, 3])), Arc::new(StringViewArray::from(vec!["a", "b", "c"]))], ) .unwrap() } @@ -363,8 +466,9 @@ mod tests { #[test] fn test_record_batch_serialization() { let batch = create_test_batch(); + let schema = batch.schema(); let serialized = serialize_record_batch(&batch).unwrap(); - let deserialized = deserialize_record_batch(&serialized).unwrap(); + let 
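// Reviewer sketch (not part of the patch): the idea behind CompactBatch above, shown for a
// single Int64 column. Only the raw value buffer is persisted; the data type comes back
// from the table schema at replay time, which is why deserialize_batch now needs the table
// name. List and struct columns additionally carry child buffers, as in the real code.
use datafusion::arrow::array::{Array, ArrayData, Int64Array};
use datafusion::arrow::buffer::Buffer;
use datafusion::arrow::datatypes::DataType;
use datafusion::arrow::error::ArrowError;

fn buffer_round_trip() -> Result<(), ArrowError> {
    let original = Int64Array::from(vec![1, 2, 3]);
    // "Serialize": copy the raw little-endian values out of the array.
    let raw: Vec<u8> = original.to_data().buffers()[0].as_slice().to_vec();
    // "Deserialize": rebuild the array from the bytes plus an external type and length.
    let data = ArrayData::builder(DataType::Int64)
        .len(3)
        .add_buffer(Buffer::from(raw.as_slice()))
        .build()?;
    let rebuilt = Int64Array::from(data);
    assert_eq!(rebuilt.len(), 3);
    assert_eq!(rebuilt.value(2), 3);
    Ok(())
}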
deserialized = deserialize_record_batch(&serialized, &schema).unwrap(); assert_eq!(batch.num_rows(), deserialized.num_rows()); assert_eq!(batch.num_columns(), deserialized.num_columns()); } diff --git a/tests/buffer_consistency_test.rs b/tests/buffer_consistency_test.rs new file mode 100644 index 0000000..1cad7be --- /dev/null +++ b/tests/buffer_consistency_test.rs @@ -0,0 +1,319 @@ +//! Buffer consistency tests - verifies query results are consistent whether data is in MemBuffer or Delta. + +use anyhow::Result; +use datafusion::arrow::array::{Array, AsArray, StringViewArray}; +use serial_test::serial; +use std::sync::Arc; +use test_case::test_case; +use timefusion::buffered_write_layer::BufferedWriteLayer; +use timefusion::database::Database; +use timefusion::test_utils::test_helpers::{BufferMode, TestConfigBuilder, json_to_batch, test_span}; + +fn get_str(arr: &dyn Array, idx: usize) -> String { + arr.as_any().downcast_ref::().map(|a| a.value(idx).to_string()).unwrap_or_default() +} + +async fn setup_db_with_buffer(mode: BufferMode) -> Result<(Arc, Arc, String)> { + let cfg = TestConfigBuilder::new("buf_test").with_buffer_mode(mode).build(); + // Set WALRUS_DATA_DIR env var so walrus-rust uses the correct path + unsafe { std::env::set_var("WALRUS_DATA_DIR", &cfg.core.walrus_data_dir) }; + let layer = Arc::new(BufferedWriteLayer::with_config(Arc::clone(&cfg))?); + let db = Arc::new( + Database::with_config(cfg) + .await? + .with_buffered_layer(Arc::clone(&layer)), + ); + let project_id = format!("proj_{}", uuid::Uuid::new_v4().to_string()[..8].to_string()); + Ok((db, layer, project_id)) +} + +fn create_records(project_id: &str, count: usize) -> Vec { + let now = chrono::Utc::now(); + (0..count) + .map(|i| { + serde_json::json!({ + "id": format!("id_{}", i), + "name": format!("name_{}", i), + "project_id": project_id, + "timestamp": now.timestamp_micros() + i as i64, + "level": "INFO", + "duration": 100 + i as i64, + "date": now.date_naive().to_string(), + "hashes": [], + "summary": [] + }) + }) + .collect() +} + +// ============================================================================= +// Parameterized tests - run in both buffer modes +// ============================================================================= + +#[test_case(BufferMode::Enabled ; "buffered")] +#[test_case(BufferMode::FlushImmediately ; "immediate")] +#[serial] +#[tokio::test] +async fn test_insert_query(mode: BufferMode) -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(mode).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + let records = create_records(&project_id, 10); + let batch = json_to_batch(records)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], true).await?; + + let result = ctx + .sql(&format!("SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) + .await? 
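// Reviewer sketch (not part of the patch): how the COUNT(*) assertions in these tests read
// their value. The generic parameter is the arrow type of the aggregate output (Int64),
// accessed through the AsArray helper trait; callers are expected to pass a non-empty
// result whose first column is the count.
use datafusion::arrow::array::AsArray;
use datafusion::arrow::datatypes::Int64Type;
use datafusion::arrow::record_batch::RecordBatch;

fn count_from_result(batches: &[RecordBatch]) -> i64 {
    batches[0].column(0).as_primitive::<Int64Type>().value(0)
}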
+ .collect() + .await?; + + let count = result[0].column(0).as_primitive::().value(0); + assert_eq!(count, 10, "Expected 10 rows"); + Ok(()) +} + +#[test_case(BufferMode::Enabled ; "buffered")] +#[test_case(BufferMode::FlushImmediately ; "immediate")] +#[serial] +#[tokio::test] +async fn test_select_columns(mode: BufferMode) -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(mode).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + let batch = json_to_batch(vec![test_span("test1", "my_span", &project_id)])?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], true).await?; + + let result = ctx + .sql(&format!("SELECT id, name FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) + .await? + .collect() + .await?; + + assert_eq!(result[0].num_rows(), 1); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "test1"); + assert_eq!(get_str(result[0].column(1).as_ref(), 0), "my_span"); + Ok(()) +} + +#[test_case(BufferMode::Enabled ; "buffered")] +#[test_case(BufferMode::FlushImmediately ; "immediate")] +#[serial] +#[tokio::test] +async fn test_update(mode: BufferMode) -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(mode).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + let records = create_records(&project_id, 3); + let batch = json_to_batch(records)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], true).await?; + + ctx.sql(&format!( + "UPDATE otel_logs_and_spans SET duration = 999 WHERE project_id = '{}' AND name = 'name_1'", + project_id + )) + .await? + .collect() + .await?; + + let result = ctx + .sql(&format!( + "SELECT name, duration FROM otel_logs_and_spans WHERE project_id = '{}' ORDER BY name", + project_id + )) + .await? + .collect() + .await?; + + let batch = &result[0]; + for i in 0..batch.num_rows() { + let name = get_str(batch.column(0).as_ref(), i); + let duration = batch.column(1).as_primitive::().value(i); + if name == "name_1" { + assert_eq!(duration, 999, "name_1 should have duration=999"); + } + } + Ok(()) +} + +#[test_case(BufferMode::Enabled ; "buffered")] +#[test_case(BufferMode::FlushImmediately ; "immediate")] +#[serial] +#[tokio::test] +async fn test_delete(mode: BufferMode) -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(mode).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + let records = create_records(&project_id, 5); + let batch = json_to_batch(records)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], true).await?; + + ctx.sql(&format!( + "DELETE FROM otel_logs_and_spans WHERE project_id = '{}' AND name = 'name_2'", + project_id + )) + .await? + .collect() + .await?; + + let result = ctx + .sql(&format!("SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) + .await? 
+ .collect() + .await?; + + let count = result[0].column(0).as_primitive::().value(0); + assert_eq!(count, 4, "Expected 4 rows after delete"); + Ok(()) +} + +#[test_case(BufferMode::Enabled ; "buffered")] +#[test_case(BufferMode::FlushImmediately ; "immediate")] +#[serial] +#[tokio::test] +async fn test_aggregations(mode: BufferMode) -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(mode).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + let records = create_records(&project_id, 10); + let batch = json_to_batch(records)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], true).await?; + + let result = ctx + .sql(&format!( + "SELECT COUNT(*) as cnt, SUM(duration) as total, AVG(duration) as avg_dur FROM otel_logs_and_spans WHERE project_id = '{}'", + project_id + )) + .await? + .collect() + .await?; + + let batch = &result[0]; + let cnt = batch.column(0).as_primitive::().value(0); + assert_eq!(cnt, 10); + Ok(()) +} + +// ============================================================================= +// Union tests - data split between buffer and Delta +// ============================================================================= + +#[serial] +#[tokio::test] +async fn test_partial_flush_union() -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(BufferMode::Enabled).await?; + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + + // Insert first batch directly to Delta (skip_queue=true) + let batch1 = json_to_batch(create_records(&project_id, 50))?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch1], true).await?; + + // Insert second batch to buffer only (skip_queue=false, no callback so no flush to Delta) + let now = chrono::Utc::now(); + let records2: Vec<_> = (50..100) + .map(|i| { + serde_json::json!({ + "id": format!("id_{}", i), + "name": format!("name_{}", i), + "project_id": &project_id, + "timestamp": now.timestamp_micros() + i as i64, + "level": "INFO", + "duration": 100 + i as i64, + "date": now.date_naive().to_string(), + "hashes": [], + "summary": [] + }) + }) + .collect(); + let batch2 = json_to_batch(records2)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch2], false).await?; + + // Query should return all 100 rows (50 from Delta + 50 from buffer) + let result = ctx + .sql(&format!("SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) + .await? 
+ .collect() + .await?; + + let count = result[0].column(0).as_primitive::().value(0); + assert_eq!(count, 100, "Expected 100 rows from union of buffer + Delta"); + Ok(()) +} + +#[serial] +#[tokio::test] +async fn test_delta_only_query() -> Result<()> { + let (db, _layer, project_id) = setup_db_with_buffer(BufferMode::Enabled).await?; + + // Insert directly to Delta (skip_queue=true) + let batch1 = json_to_batch(create_records(&project_id, 30))?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch1], true).await?; + + // Insert to buffer only (skip_queue=false, no callback so stays in buffer) + let now = chrono::Utc::now(); + let records2: Vec<_> = (30..50) + .map(|i| { + serde_json::json!({ + "id": format!("id_{}", i), + "name": format!("name_{}", i), + "project_id": &project_id, + "timestamp": now.timestamp_micros() + i as i64, + "level": "INFO", + "duration": 100, + "date": now.date_naive().to_string(), + "hashes": [], + "summary": [] + }) + }) + .collect(); + let batch2 = json_to_batch(records2)?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch2], false).await?; + + // Delta-only query should return only Delta data (30 rows) + let delta_result = db + .query_delta_only(&format!( + "SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", + project_id + )) + .await?; + + let delta_count = delta_result[0].column(0).as_primitive::().value(0); + assert_eq!(delta_count, 30, "Delta-only should return 30 rows from Delta"); + + // Normal query should return all 50 (30 from Delta + 20 from buffer) + let mut ctx = Arc::clone(&db).create_session_context(); + db.setup_session_context(&mut ctx)?; + let full_result = ctx + .sql(&format!("SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) + .await? 
+ .collect() + .await?; + + let full_count = full_result[0].column(0).as_primitive::().value(0); + assert_eq!(full_count, 50, "Full query should return all 50 rows"); + Ok(()) +} + +// ============================================================================= +// Immediate flush verification +// ============================================================================= + +#[serial] +#[tokio::test] +async fn test_immediate_flush_drains_buffer() -> Result<()> { + let (db, layer, project_id) = setup_db_with_buffer(BufferMode::FlushImmediately).await?; + + // Insert with immediate mode through buffer (skip_queue=false) + let batch = json_to_batch(create_records(&project_id, 10))?; + db.insert_records_batch(&project_id, "otel_logs_and_spans", vec![batch], false).await?; + + // Buffer should be empty after immediate flush (flush drains buffer even without callback) + assert!(layer.is_empty(), "Buffer should be empty after immediate flush"); + Ok(()) +} diff --git a/tests/connection_pressure_test.rs b/tests/connection_pressure_test.rs index ca5e1e5..c0d65ca 100644 --- a/tests/connection_pressure_test.rs +++ b/tests/connection_pressure_test.rs @@ -26,7 +26,7 @@ mod connection_pressure { impl PressureTestServer { async fn start() -> Result { - let _ = env_logger::builder().is_test(true).try_init(); + timefusion::test_utils::init_test_logging(); dotenv().ok(); let test_id = Uuid::new_v4().to_string(); diff --git a/tests/delta_rs_api_test.rs b/tests/delta_rs_api_test.rs index 3bcfa32..6777585 100644 --- a/tests/delta_rs_api_test.rs +++ b/tests/delta_rs_api_test.rs @@ -1,10 +1,22 @@ use anyhow::Result; -use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{Array, AsArray, StringArray, LargeStringArray, StringViewArray}; use serial_test::serial; use std::sync::Arc; use timefusion::database::Database; use timefusion::test_utils::test_helpers::*; +fn get_str(array: &dyn Array, idx: usize) -> String { + if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else if let Some(arr) = array.as_any().downcast_ref::() { + arr.value(idx).to_string() + } else { + panic!("Unsupported string array type: {:?}", array.data_type()) + } +} + async fn setup_test_database() -> Result<(Database, datafusion::prelude::SessionContext)> { dotenv::dotenv().ok(); unsafe { @@ -58,7 +70,7 @@ async fn test_partition_column_ordering() -> Result<()> { .await?; assert_eq!(result[0].num_rows(), 1); - assert_eq!(result[0].column(0).as_string::().value(0), "partition_project"); + assert_eq!(get_str(result[0].column(0).as_ref(), 0), "partition_project"); db.shutdown().await?; Ok(()) diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 40b0c00..d0c7056 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -42,7 +42,7 @@ mod integration { impl TestServer { async fn start() -> Result { - let _ = env_logger::builder().is_test(true).try_init(); + timefusion::test_utils::init_test_logging(); let test_id = Uuid::new_v4().to_string(); let port = 5433 + rand::rng().random_range(1..100) as u16; diff --git a/tests/test_custom_functions.rs b/tests/test_custom_functions.rs index f974cd7..3968377 100644 --- a/tests/test_custom_functions.rs +++ b/tests/test_custom_functions.rs @@ -1,10 +1,21 @@ #[cfg(test)] mod test_custom_functions { use anyhow::Result; - use datafusion::arrow::array::AsArray; + use datafusion::arrow::array::{Array, StringArray, StringViewArray}; use 
datafusion::prelude::*; use timefusion::functions::register_custom_functions; + /// Helper to get string value from either Utf8View or Utf8 array + fn get_str(arr: &dyn Array, idx: usize) -> String { + if let Some(sv) = arr.as_any().downcast_ref::() { + sv.value(idx).to_string() + } else if let Some(s) = arr.as_any().downcast_ref::() { + s.value(idx).to_string() + } else { + panic!("Expected string array but got {:?}", arr.data_type()); + } + } + #[tokio::test] async fn test_to_char_function() -> Result<()> { // Create a new SessionContext @@ -34,8 +45,7 @@ mod test_custom_functions { let batch = &results[0]; assert_eq!(batch.num_rows(), 1); - let array = batch.column(0).as_string::(); - let actual = array.value(0); + let actual = get_str(batch.column(0).as_ref(), 0); assert_eq!(actual, expected, "Format '{}' failed", format); } @@ -69,8 +79,7 @@ mod test_custom_functions { assert_eq!(results2.len(), 1); let batch2 = &results2[0]; - let array = batch2.column(0).as_string::(); - let actual = array.value(0); + let actual = get_str(batch2.column(0).as_ref(), 0); // UTC 14:30:45 -> America/New_York (UTC-5 in January) = 09:30:45 assert_eq!(actual, "2024-01-15 09:30:45"); diff --git a/tests/test_dml_operations.rs b/tests/test_dml_operations.rs index da87941..7515437 100644 --- a/tests/test_dml_operations.rs +++ b/tests/test_dml_operations.rs @@ -2,17 +2,23 @@ mod test_dml_operations { use anyhow::Result; use datafusion::arrow; - use datafusion::arrow::array::AsArray; + use datafusion::arrow::array::{Array, AsArray, StringArray, StringViewArray}; use serial_test::serial; use std::path::PathBuf; use std::sync::Arc; use timefusion::config::AppConfig; use timefusion::database::Database; - use tracing::{Level, info}; - - fn init_tracing() { - let subscriber = tracing_subscriber::fmt().with_max_level(Level::INFO).with_target(false).finish(); - let _ = tracing::subscriber::set_global_default(subscriber); + use tracing::info; + + /// Helper function to get string value from either Utf8View or Utf8 array + fn get_str(arr: &dyn Array, idx: usize) -> String { + if let Some(sv) = arr.as_any().downcast_ref::() { + sv.value(idx).to_string() + } else if let Some(s) = arr.as_any().downcast_ref::() { + s.value(idx).to_string() + } else { + panic!("Expected string array but got {:?}", arr.data_type()); + } } fn create_test_config(test_id: &str) -> Arc { @@ -80,7 +86,7 @@ mod test_dml_operations { #[serial] #[tokio::test] async fn test_update_query() -> Result<()> { - init_tracing(); + timefusion::test_utils::init_test_logging(); let test_id = uuid::Uuid::new_v4().to_string()[..8].to_string(); let cfg = create_test_config(&test_id); let db = Arc::new(Database::with_config(cfg).await?); @@ -116,11 +122,11 @@ mod test_dml_operations { let name_col_idx = batch.schema().fields().iter().position(|f| f.name() == "name").unwrap(); let duration_col_idx = batch.schema().fields().iter().position(|f| f.name() == "duration").unwrap(); - let name_col = batch.column(name_col_idx).as_string::(); + let name_col = batch.column(name_col_idx).as_ref(); let duration_col = batch.column(duration_col_idx).as_primitive::(); for i in 0..batch.num_rows() { - match name_col.value(i) { + match get_str(name_col, i).as_str() { "Bob" => assert_eq!(duration_col.value(i), 500, "Bob's duration should be updated to 500"), "Alice" => assert_eq!(duration_col.value(i), 100, "Alice's duration should remain 100"), "Charlie" => assert_eq!(duration_col.value(i), 300, "Charlie's duration should remain 300"), @@ -136,7 +142,7 @@ mod test_dml_operations { 
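[Editor's note] These hunks keep swapping ad-hoc env_logger / tracing setup for a shared timefusion::test_utils::init_test_logging(). Its implementation is not part of this patch series, so the following is only a plausible minimal sketch, assuming the tracing-subscriber crate and an idempotent process-wide guard; names other than init_test_logging are illustrative.

use std::sync::Once;

static INIT: Once = Once::new();

// Hypothetical sketch only: the real helper lives in src/test_utils.rs and is not shown here.
pub fn init_test_logging() {
    INIT.call_once(|| {
        // try_init() tolerates an already-installed subscriber, so repeated calls
        // across #[serial] tests stay harmless.
        let _ = tracing_subscriber::fmt()
            .with_test_writer()
            .with_target(false)
            .try_init();
    });
}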
#[serial] #[tokio::test] async fn test_delete_with_predicate() -> Result<()> { - init_tracing(); + timefusion::test_utils::init_test_logging(); let test_id = uuid::Uuid::new_v4().to_string()[..8].to_string(); let cfg = create_test_config(&test_id); let db = Arc::new(Database::with_config(cfg).await?); @@ -172,13 +178,13 @@ mod test_dml_operations { let id_col_idx = batch.schema().fields().iter().position(|f| f.name() == "id").unwrap(); let name_col_idx = batch.schema().fields().iter().position(|f| f.name() == "name").unwrap(); - let id_col = batch.column(id_col_idx).as_string::(); - let name_col = batch.column(name_col_idx).as_string::(); + let id_col = batch.column(id_col_idx).as_ref(); + let name_col = batch.column(name_col_idx).as_ref(); - assert_eq!(id_col.value(0), "1"); - assert_eq!(name_col.value(0), "Alice"); - assert_eq!(id_col.value(1), "3"); - assert_eq!(name_col.value(1), "Charlie"); + assert_eq!(get_str(id_col, 0), "1"); + assert_eq!(get_str(name_col, 0), "Alice"); + assert_eq!(get_str(id_col, 1), "3"); + assert_eq!(get_str(name_col, 1), "Charlie"); Ok(()) } @@ -265,11 +271,11 @@ mod test_dml_operations { let results = df.collect().await?; let batch = &results[0]; - let id_col = batch.column(0).as_string::(); - let level_col = batch.column(1).as_string::(); + let id_col = batch.column(0).as_ref(); + let level_col = batch.column(1).as_ref(); - assert_eq!(id_col.value(0), "2"); - assert_eq!(level_col.value(0), "INFO"); + assert_eq!(get_str(id_col, 0), "2"); + assert_eq!(get_str(level_col, 0), "INFO"); Ok(()) } @@ -281,7 +287,7 @@ mod test_dml_operations { #[serial] #[tokio::test] async fn test_update_multiple_columns() -> Result<()> { - init_tracing(); + timefusion::test_utils::init_test_logging(); let test_id = uuid::Uuid::new_v4().to_string()[..8].to_string(); let cfg = create_test_config(&test_id); let db = Arc::new(Database::with_config(cfg).await?); @@ -319,10 +325,10 @@ mod test_dml_operations { let level_idx = batch.schema().fields().iter().position(|f| f.name() == "level").unwrap(); let duration_col = batch.column(duration_idx).as_primitive::(); - let level_col = batch.column(level_idx).as_string::(); + let level_col = batch.column(level_idx).as_ref(); assert_eq!(duration_col.value(0), 999, "Duration should be updated to 999"); - assert_eq!(level_col.value(0), "WARN", "Level should be updated to WARN"); + assert_eq!(get_str(level_col, 0), "WARN", "Level should be updated to WARN"); Ok(()) } @@ -334,7 +340,7 @@ mod test_dml_operations { #[serial] #[tokio::test] async fn test_delete_verify_counts() -> Result<()> { - init_tracing(); + timefusion::test_utils::init_test_logging(); let test_id = uuid::Uuid::new_v4().to_string()[..8].to_string(); let cfg = create_test_config(&test_id); let db = Arc::new(Database::with_config(cfg).await?); diff --git a/tests/test_postgres_json_functions.rs b/tests/test_postgres_json_functions.rs index 3d65da6..7e680ad 100644 --- a/tests/test_postgres_json_functions.rs +++ b/tests/test_postgres_json_functions.rs @@ -1,8 +1,20 @@ #[cfg(test)] mod test_json_functions { use anyhow::Result; + use datafusion::arrow::array::{Array, StringArray, StringViewArray}; use timefusion::database::Database; + /// Helper to extract string value from either Utf8View or Utf8 array + fn get_str(arr: &dyn Array, idx: usize) -> String { + if let Some(sv) = arr.as_any().downcast_ref::() { + sv.value(idx).to_string() + } else if let Some(s) = arr.as_any().downcast_ref::() { + s.value(idx).to_string() + } else { + panic!("Expected string array but got {:?}", 
arr.data_type()); + } + } + #[tokio::test] async fn test_json_build_array() -> Result<()> { // Initialize database @@ -17,8 +29,7 @@ mod test_json_functions { assert_eq!(results.len(), 1); let batch = &results[0]; let column = batch.column(0); - let value = column.as_any().downcast_ref::().unwrap(); - assert_eq!(value.value(0), r#"["a","b","c"]"#); + assert_eq!(get_str(column.as_ref(), 0), r#"["a","b","c"]"#); Ok(()) } @@ -37,8 +48,7 @@ mod test_json_functions { assert_eq!(results.len(), 1); let batch = &results[0]; let column = batch.column(0); - let value = column.as_any().downcast_ref::().unwrap(); - assert_eq!(value.value(0), r#"{"hello":"world"}"#); + assert_eq!(get_str(column.as_ref(), 0), r#"{"hello":"world"}"#); // Test to_json with number let df = ctx.sql("SELECT to_json(123) as result").await?; @@ -46,8 +56,7 @@ mod test_json_functions { assert_eq!(results.len(), 1); let batch = &results[0]; let column = batch.column(0); - let value = column.as_any().downcast_ref::().unwrap(); - assert_eq!(value.value(0), "123"); + assert_eq!(get_str(column.as_ref(), 0), "123"); Ok(()) } @@ -87,8 +96,7 @@ mod test_json_functions { assert_eq!(results.len(), 1); let batch = &results[0]; let column = batch.column(0); - let value = column.as_any().downcast_ref::().unwrap(); - assert_eq!(value.value(0), "2025-08-07 10:00:00"); + assert_eq!(get_str(column.as_ref(), 0), "2025-08-07 10:00:00"); Ok(()) } @@ -111,8 +119,7 @@ mod test_json_functions { assert_eq!(results.len(), 1); let batch = &results[0]; let column = batch.column(0); - let value = column.as_any().downcast_ref::().unwrap(); - assert_eq!(value.value(0), r#"["001","test_span",1500,{"status":"ok"}]"#); + assert_eq!(get_str(column.as_ref(), 0), r#"["001","test_span",1500,{"status":"ok"}]"#); Ok(()) } From 9f3df13d5cf1db2428d533618b7d1577ca2cad6b Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 16:40:11 -0800 Subject: [PATCH 2/6] Upgrade to DataFusion 52 with Utf8View support and fix WAL metadata limits - Update delta-rs to ffb794ba to include Utf8View predicate fixes - Migrate string types to Utf8View for better performance - Fix WAL metadata size limit by using hashed topic keys (16-char hex with ahash) - Add bincode serialization for WAL entries (schema-less, compact) - Remove unnecessary session state from DML operations - Add buffer_consistency_test.rs with comprehensive buffer/Delta tests - Update test utilities and assertions for Utf8View compatibility --- src/buffered_write_layer.rs | 5 +++- src/database.rs | 31 ++++++++++-------------- src/dml.rs | 5 +++- src/test_utils.rs | 9 +++---- src/wal.rs | 41 ++++++++++++++++++-------------- tests/buffer_consistency_test.rs | 11 ++------- tests/delta_rs_api_test.rs | 2 +- 7 files changed, 52 insertions(+), 52 deletions(-) diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs index 2b30749..d4e5034 100644 --- a/src/buffered_write_layer.rs +++ b/src/buffered_write_layer.rs @@ -459,7 +459,10 @@ impl BufferedWriteLayer { pub async fn flush_all_now(&self) -> anyhow::Result { let _flush_guard = self.flush_lock.lock().await; let all_buckets = self.mem_buffer.get_all_buckets(); - let mut stats = FlushStats { total_rows: all_buckets.iter().map(|b| b.row_count as u64).sum(), ..Default::default() }; + let mut stats = FlushStats { + total_rows: all_buckets.iter().map(|b| b.row_count as u64).sum(), + ..Default::default() + }; for bucket in all_buckets { match self.flush_bucket(&bucket).await { diff --git a/src/database.rs b/src/database.rs index 4539028..7b717ea 100644 
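[Editor's note] The get_str helpers added to the test files above have lost their turbofish type arguments in this rendering. Reconstructed from the imports declared in those same hunks (StringArray, LargeStringArray, StringViewArray), the intended helper is roughly the sketch below; the COUNT(*) assertions in the same tests presumably read values through as_primitive::<Int64Type>() in the original source.

use datafusion::arrow::array::{Array, LargeStringArray, StringArray, StringViewArray};

/// Extract a string value regardless of whether the column arrived as
/// Utf8, LargeUtf8, or the new Utf8View representation.
fn get_str(array: &dyn Array, idx: usize) -> String {
    if let Some(arr) = array.as_any().downcast_ref::<StringArray>() {
        arr.value(idx).to_string()
    } else if let Some(arr) = array.as_any().downcast_ref::<LargeStringArray>() {
        arr.value(idx).to_string()
    } else if let Some(arr) = array.as_any().downcast_ref::<StringViewArray>() {
        arr.value(idx).to_string()
    } else {
        panic!("Unsupported string array type: {:?}", array.data_type())
    }
}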
--- a/src/database.rs +++ b/src/database.rs @@ -7,15 +7,15 @@ use arrow_schema::SchemaRef; use async_trait::async_trait; use chrono::Utc; use datafusion::arrow::array::Array; -use datafusion::physical_expr::expressions::{CastExpr, Column as PhysicalColumn}; -use datafusion::physical_plan::projection::ProjectionExec; use datafusion::common::not_impl_err; use datafusion::common::{SchemaExt, Statistics}; use datafusion::datasource::sink::{DataSink, DataSinkExec}; use datafusion::execution::TaskContext; use datafusion::execution::context::SessionContext; use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown}; +use datafusion::physical_expr::expressions::{CastExpr, Column as PhysicalColumn}; use datafusion::physical_plan::DisplayAs; +use datafusion::physical_plan::projection::ProjectionExec; use datafusion::scalar::ScalarValue; use datafusion::{ catalog::Session, @@ -734,18 +734,7 @@ impl Database { "search_path", ]; - let settings: Vec<&str> = vec![ - "UTC", - "UTF8", - "ISO, MDY", - "notice", - "C", - "C", - "C", - "on", - "TimeFusion", - "public", - ]; + let settings: Vec<&str> = vec!["UTC", "UTF8", "ISO, MDY", "notice", "C", "C", "C", "on", "TimeFusion", "public"]; let batch = RecordBatch::try_new( schema.clone(), @@ -1150,7 +1139,9 @@ impl Database { // Set env vars from storage_options for delta-rs credential resolution for (key, value) in &storage_options { if key.starts_with("AWS_") { - unsafe { std::env::set_var(key, value); } + unsafe { + std::env::set_var(key, value); + } } } @@ -1743,7 +1734,9 @@ impl ProjectRoutingTable { // Determine target schema based on projection let target_schema = if let Some(proj) = projection { - Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())) + Arc::new(arrow_schema::Schema::new( + proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>(), + )) } else { self.schema.clone() }; @@ -2089,7 +2082,9 @@ impl TableProvider for ProjectRoutingTable { // Determine target schema based on projection let target_schema = if let Some(proj) = projection { - Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())) + Arc::new(arrow_schema::Schema::new( + proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>(), + )) } else { self.schema.clone() }; @@ -2158,7 +2153,7 @@ mod tests { /// Helper function to extract string value from array column, handling different string array types fn get_str(array: &dyn Array, idx: usize) -> String { - use datafusion::arrow::array::{StringArray, LargeStringArray, StringViewArray}; + use datafusion::arrow::array::{LargeStringArray, StringArray, StringViewArray}; if let Some(arr) = array.as_any().downcast_ref::() { arr.value(idx).to_string() } else if let Some(arr) = array.as_any().downcast_ref::() { diff --git a/src/dml.rs b/src/dml.rs index 1395c26..3718efe 100644 --- a/src/dml.rs +++ b/src/dml.rs @@ -9,7 +9,10 @@ use datafusion::{ }, common::{Column, Result}, error::DataFusionError, - execution::{SendableRecordBatchStream, TaskContext, context::{QueryPlanner, SessionState}}, + execution::{ + SendableRecordBatchStream, TaskContext, + context::{QueryPlanner, SessionState}, + }, logical_expr::{BinaryExpr, Expr, LogicalPlan, Operator, WriteOp}, physical_plan::{DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, PlanProperties, stream::RecordBatchStreamAdapter}, physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, diff --git a/src/test_utils.rs b/src/test_utils.rs index 
3c00e46..f7aa816 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -33,7 +33,10 @@ pub mod test_helpers { impl TestConfigBuilder { pub fn new(test_name: &str) -> Self { - Self { test_name: test_name.to_string(), buffer_mode: BufferMode::Enabled } + Self { + test_name: test_name.to_string(), + buffer_mode: BufferMode::Enabled, + } } pub fn with_buffer_mode(mut self, mode: BufferMode) -> Self { @@ -69,9 +72,7 @@ pub mod test_helpers { .map(|f| { let data_type = match f.data_type() { DataType::Utf8View => DataType::Utf8, - DataType::List(inner) if inner.data_type() == &DataType::Utf8View => { - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) - } + DataType::List(inner) if inner.data_type() == &DataType::Utf8View => DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), other => other.clone(), }; Field::new(f.name(), data_type, f.is_nullable()) diff --git a/src/wal.rs b/src/wal.rs index fc57993..e084752 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -125,22 +125,26 @@ impl CompactColumn { } fn to_array_data(&self, data_type: &DataType, len: usize) -> arrow::array::ArrayData { - let null_buffer = self.null_bitmap.as_ref().map(|b| { - NullBuffer::new(arrow::buffer::BooleanBuffer::new(Buffer::from(b.as_slice()), 0, len)) - }); + let null_buffer = self + .null_bitmap + .as_ref() + .map(|b| NullBuffer::new(arrow::buffer::BooleanBuffer::new(Buffer::from(b.as_slice()), 0, len))); let buffers: Vec = self.buffers.iter().map(|b| Buffer::from(b.as_slice())).collect(); let child_data: Vec = match data_type { - DataType::List(field) => { - self.children.iter().zip(&self.child_lens) - .map(|(child, &child_len)| child.to_array_data(field.data_type(), child_len)) - .collect() - } - DataType::Struct(fields) => { - self.children.iter().zip(fields.iter()).zip(&self.child_lens) - .map(|((child, field), &child_len)| child.to_array_data(field.data_type(), child_len)) - .collect() - } + DataType::List(field) => self + .children + .iter() + .zip(&self.child_lens) + .map(|(child, &child_len)| child.to_array_data(field.data_type(), child_len)) + .collect(), + DataType::Struct(fields) => self + .children + .iter() + .zip(fields.iter()) + .zip(&self.child_lens) + .map(|((child, field), &child_len)| child.to_array_data(field.data_type(), child_len)) + .collect(), _ => vec![], }; @@ -207,8 +211,9 @@ impl WalManager { /// Short hash for walrus topic key (walrus has 62-byte metadata limit) fn walrus_topic_key(project_id: &str, table_name: &str) -> String { + use ahash::AHasher; use std::hash::{Hash, Hasher}; - let mut hasher = std::collections::hash_map::DefaultHasher::new(); + let mut hasher = AHasher::default(); project_id.hash(&mut hasher); table_name.hash(&mut hasher); format!("{:016x}", hasher.finish()) @@ -351,9 +356,7 @@ impl WalManager { } pub fn deserialize_batch(data: &[u8], table_name: &str) -> Result { - let schema = get_schema(table_name) - .map(|s| s.schema_ref()) - .unwrap_or_else(|| get_default_schema().schema_ref()); + let schema = get_schema(table_name).map(|s| s.schema_ref()).unwrap_or_else(|| get_default_schema().schema_ref()); deserialize_record_batch(data, &schema) } @@ -398,7 +401,9 @@ fn serialize_record_batch(batch: &RecordBatch) -> Result, WalError> { fn deserialize_record_batch(data: &[u8], schema: &SchemaRef) -> Result { let (compact, _): (CompactBatch, _) = bincode::decode_from_slice(data, BINCODE_CONFIG)?; - let arrays: Vec = compact.columns.iter() + let arrays: Vec = compact + .columns + .iter() .zip(schema.fields()) .map(|(col, field)| { let array_data = 
col.to_array_data(field.data_type(), compact.num_rows); diff --git a/tests/buffer_consistency_test.rs b/tests/buffer_consistency_test.rs index 1cad7be..f648986 100644 --- a/tests/buffer_consistency_test.rs +++ b/tests/buffer_consistency_test.rs @@ -18,11 +18,7 @@ async fn setup_db_with_buffer(mode: BufferMode) -> Result<(Arc, Arc Result<()> { // Delta-only query should return only Delta data (30 rows) let delta_result = db - .query_delta_only(&format!( - "SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", - project_id - )) + .query_delta_only(&format!("SELECT COUNT(*) as cnt FROM otel_logs_and_spans WHERE project_id = '{}'", project_id)) .await?; let delta_count = delta_result[0].column(0).as_primitive::().value(0); diff --git a/tests/delta_rs_api_test.rs b/tests/delta_rs_api_test.rs index 6777585..e361e13 100644 --- a/tests/delta_rs_api_test.rs +++ b/tests/delta_rs_api_test.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use datafusion::arrow::array::{Array, AsArray, StringArray, LargeStringArray, StringViewArray}; +use datafusion::arrow::array::{Array, AsArray, LargeStringArray, StringArray, StringViewArray}; use serial_test::serial; use std::sync::Arc; use timefusion::database::Database; From fc5e9d5c17a4ff2e826e05f2337f84e0080f07bf Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 17:00:26 -0800 Subject: [PATCH 3/6] Fix WAL safety issues and add memory reservation backoff - Replace unsafe ArrayData::new_unchecked with validated try_new - Add MAX_BATCH_SIZE (100MB) limit to prevent unbounded allocation - Add WAL format versioning (v128) for future compatibility - Add exponential backoff to CAS loop to reduce CPU thrashing - Define named constants for magic numbers - Add support for LargeList, FixedSizeList, Map types in WAL --- src/buffered_write_layer.rs | 25 ++++++++-- src/wal.rs | 85 ++++++++++++++++++++++---------- tests/buffer_consistency_test.rs | 4 +- 3 files changed, 81 insertions(+), 33 deletions(-) diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs index d4e5034..5f4a99a 100644 --- a/src/buffered_write_layer.rs +++ b/src/buffered_write_layer.rs @@ -14,6 +14,14 @@ use tracing::{debug, error, info, instrument, warn}; // 20% overhead accounts for DashMap internal structures, RwLock wrappers, // Arc refs, and Arrow buffer alignment padding const MEMORY_OVERHEAD_MULTIPLIER: f64 = 1.2; +/// Hard limit multiplier (120%) provides headroom for in-flight writes while preventing OOM +const HARD_LIMIT_MULTIPLIER: usize = 5; // max_bytes + max_bytes/5 = 120% +/// Maximum CAS retry attempts before failing +const MAX_CAS_RETRIES: u32 = 100; +/// Base backoff delay in microseconds for CAS retries +const CAS_BACKOFF_BASE_MICROS: u64 = 1; +/// Maximum backoff exponent (caps delay at ~1ms) +const CAS_BACKOFF_MAX_EXPONENT: u32 = 10; #[derive(Debug, Default)] pub struct RecoveryStats { @@ -100,16 +108,15 @@ impl BufferedWriteLayer { /// Try to reserve memory atomically before a write. /// Returns estimated batch size on success, or error if hard limit exceeded. - /// Callers MUST implement retry logic - hard failures may cause data loss. + /// Uses exponential backoff to reduce CPU thrashing under contention. 
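[Editor's note] The reservation routine that follows is the heart of this patch. As a standalone illustration of the compare-and-swap loop with bounded exponential backoff it describes — simplified, and with hypothetical field and parameter names rather than the exact ones in buffered_write_layer.rs — it boils down to:

use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;

const MAX_CAS_RETRIES: u32 = 100;
const CAS_BACKOFF_BASE_MICROS: u64 = 1;
const CAS_BACKOFF_MAX_EXPONENT: u32 = 10; // caps the sleep at ~1ms

/// Simplified sketch: atomically reserve `size` bytes against `reserved`,
/// failing once `hard_limit` would be exceeded.
fn try_reserve(reserved: &AtomicUsize, size: usize, hard_limit: usize) -> anyhow::Result<usize> {
    for attempt in 0..MAX_CAS_RETRIES {
        let current = reserved.load(Ordering::Acquire);
        if current + size > hard_limit {
            anyhow::bail!("memory hard limit exceeded");
        }
        // Publish the reservation only if no other writer moved the counter first.
        if reserved
            .compare_exchange(current, current + size, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
        {
            return Ok(size);
        }
        // Lost the race: spin briefly, then back off exponentially (1µs .. ~1ms).
        if attempt < 5 {
            std::hint::spin_loop();
        } else {
            let micros = CAS_BACKOFF_BASE_MICROS << attempt.min(CAS_BACKOFF_MAX_EXPONENT);
            std::thread::sleep(Duration::from_micros(micros));
        }
    }
    anyhow::bail!("failed to reserve memory after {} retries", MAX_CAS_RETRIES)
}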
fn try_reserve_memory(&self, batches: &[RecordBatch]) -> anyhow::Result { let batch_size: usize = batches.iter().map(estimate_batch_size).sum(); let estimated_size = (batch_size as f64 * MEMORY_OVERHEAD_MULTIPLIER) as usize; let max_bytes = self.max_memory_bytes(); - // Hard limit at 120% provides headroom for in-flight writes while preventing OOM - let hard_limit = max_bytes.saturating_add(max_bytes / 5); + let hard_limit = max_bytes.saturating_add(max_bytes / HARD_LIMIT_MULTIPLIER); - for _ in 0..100 { + for attempt in 0..MAX_CAS_RETRIES { let current_reserved = self.reserved_bytes.load(Ordering::Acquire); let current_mem = self.mem_buffer.estimated_memory_bytes(); let new_total = current_mem + current_reserved + estimated_size; @@ -130,8 +137,16 @@ impl BufferedWriteLayer { { return Ok(estimated_size); } + + // Exponential backoff: spin_loop for first few attempts, then yield + if attempt < 5 { + std::hint::spin_loop(); + } else { + let backoff_micros = CAS_BACKOFF_BASE_MICROS << attempt.min(CAS_BACKOFF_MAX_EXPONENT); + std::thread::sleep(std::time::Duration::from_micros(backoff_micros)); + } } - anyhow::bail!("Failed to reserve memory after 100 retries due to contention") + anyhow::bail!("Failed to reserve memory after {} retries due to contention", MAX_CAS_RETRIES) } fn release_reservation(&self, size: usize) { diff --git a/src/wal.rs b/src/wal.rs index e084752..89c6b0a 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -13,8 +13,12 @@ use walrus_rust::{FsyncSchedule, ReadConsistency, Walrus}; pub enum WalError { #[error("WAL entry too short: {len} bytes")] TooShort { len: usize }, + #[error("Batch too large: {size} bytes exceeds max {max}")] + BatchTooLarge { size: usize, max: usize }, #[error("Invalid WAL operation type: {0}")] InvalidOperation(u8), + #[error("Unsupported WAL version: {version} (expected {expected})")] + UnsupportedVersion { version: u8, expected: u8 }, #[error("Bincode decode error: {0}")] BincodeDecode(#[from] bincode::error::DecodeError), #[error("Bincode encode error: {0}")] @@ -29,7 +33,13 @@ pub enum WalError { /// Magic bytes to identify new WAL format with DML support const WAL_MAGIC: [u8; 4] = [0x57, 0x41, 0x4C, 0x32]; // "WAL2" +/// Version byte must be > 2 to distinguish from legacy operation bytes (0=Insert, 1=Delete, 2=Update) +const WAL_VERSION: u8 = 128; const BINCODE_CONFIG: bincode::config::Configuration = bincode::config::standard(); +/// Maximum size for a single record batch (100MB) - prevents unbounded memory allocation from malicious/corrupted WAL +const MAX_BATCH_SIZE: usize = 100 * 1024 * 1024; +/// Fsync schedule interval in milliseconds - balances durability with performance +const FSYNC_SCHEDULE_MS: u64 = 200; #[derive(Debug, Clone, Copy, PartialEq, Eq, Encode, Decode)] #[repr(u8)] @@ -124,15 +134,15 @@ impl CompactColumn { } } - fn to_array_data(&self, data_type: &DataType, len: usize) -> arrow::array::ArrayData { + fn to_array_data(&self, data_type: &DataType, len: usize) -> Result { let null_buffer = self .null_bitmap .as_ref() .map(|b| NullBuffer::new(arrow::buffer::BooleanBuffer::new(Buffer::from(b.as_slice()), 0, len))); let buffers: Vec = self.buffers.iter().map(|b| Buffer::from(b.as_slice())).collect(); - let child_data: Vec = match data_type { - DataType::List(field) => self + let child_data: Result, WalError> = match data_type { + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) => self .children .iter() .zip(&self.child_lens) @@ -145,20 +155,24 @@ impl CompactColumn { .zip(&self.child_lens) 
.map(|((child, field), &child_len)| child.to_array_data(field.data_type(), child_len)) .collect(), - _ => vec![], + DataType::Map(field, _) => self + .children + .iter() + .zip(&self.child_lens) + .map(|(child, &child_len)| child.to_array_data(field.data_type(), child_len)) + .collect(), + _ => Ok(vec![]), }; - unsafe { - arrow::array::ArrayData::new_unchecked( - data_type.clone(), - len, - Some(self.null_count), - null_buffer.map(|n| n.into_inner().into_inner()), - 0, - buffers, - child_data, - ) - } + arrow::array::ArrayData::try_new( + data_type.clone(), + len, + null_buffer.map(|n| n.into_inner().into_inner()), + 0, + buffers, + child_data?, + ) + .map_err(WalError::ArrowIpc) } } @@ -172,7 +186,7 @@ impl WalManager { pub fn new(data_dir: PathBuf) -> Result { std::fs::create_dir_all(&data_dir)?; - let wal = Walrus::with_consistency_and_schedule(ReadConsistency::StrictlyAtOnce, FsyncSchedule::Milliseconds(200))?; + let wal = Walrus::with_consistency_and_schedule(ReadConsistency::StrictlyAtOnce, FsyncSchedule::Milliseconds(FSYNC_SCHEDULE_MS))?; // Load known topics from index file let meta_dir = data_dir.join(".timefusion_meta"); @@ -399,23 +413,25 @@ fn serialize_record_batch(batch: &RecordBatch) -> Result, WalError> { } fn deserialize_record_batch(data: &[u8], schema: &SchemaRef) -> Result { + if data.len() > MAX_BATCH_SIZE { + return Err(WalError::BatchTooLarge { size: data.len(), max: MAX_BATCH_SIZE }); + } + let (compact, _): (CompactBatch, _) = bincode::decode_from_slice(data, BINCODE_CONFIG)?; - let arrays: Vec = compact + let arrays: Result, WalError> = compact .columns .iter() .zip(schema.fields()) - .map(|(col, field)| { - let array_data = col.to_array_data(field.data_type(), compact.num_rows); - make_array(array_data) - }) + .map(|(col, field)| Ok(make_array(col.to_array_data(field.data_type(), compact.num_rows)?))) .collect(); - RecordBatch::try_new(schema.clone(), arrays).map_err(WalError::ArrowIpc) + RecordBatch::try_new(schema.clone(), arrays?).map_err(WalError::ArrowIpc) } fn serialize_wal_entry(entry: &WalEntry) -> Result, WalError> { let mut buffer = WAL_MAGIC.to_vec(); + buffer.push(WAL_VERSION); buffer.push(entry.operation as u8); buffer.extend(bincode::encode_to_vec(entry, BINCODE_CONFIG)?); Ok(buffer) @@ -426,13 +442,28 @@ fn deserialize_wal_entry(data: &[u8]) -> Result { return Err(WalError::TooShort { len: data.len() }); } - // Check for new format (magic header) if data[0..4] == WAL_MAGIC { - WalOperation::try_from(data[4])?; // Validate operation type - let (entry, _): (WalEntry, _) = bincode::decode_from_slice(&data[5..], BINCODE_CONFIG)?; - Ok(entry) + // v1+ format: data[4] is version byte (>= 1), data[5] is operation + // v0 format: data[4] is operation (0-2), no version byte + // Distinguish: if data[4] > 2, it must be a version byte + if data[4] > 2 { + if data.len() < 6 { + return Err(WalError::TooShort { len: data.len() }); + } + if data[4] != WAL_VERSION { + return Err(WalError::UnsupportedVersion { version: data[4], expected: WAL_VERSION }); + } + WalOperation::try_from(data[5])?; + let (entry, _): (WalEntry, _) = bincode::decode_from_slice(&data[6..], BINCODE_CONFIG)?; + Ok(entry) + } else { + // Legacy v0: magic + operation + data + WalOperation::try_from(data[4])?; + let (entry, _): (WalEntry, _) = bincode::decode_from_slice(&data[5..], BINCODE_CONFIG)?; + Ok(entry) + } } else { - // Old format - decode without magic header, assume INSERT + // Ancient format - no magic header, assume INSERT let (mut entry, _): (WalEntry, _) = 
bincode::decode_from_slice(data, BINCODE_CONFIG)?; entry.operation = WalOperation::Insert; Ok(entry) diff --git a/tests/buffer_consistency_test.rs b/tests/buffer_consistency_test.rs index f648986..dbcc605 100644 --- a/tests/buffer_consistency_test.rs +++ b/tests/buffer_consistency_test.rs @@ -15,7 +15,9 @@ fn get_str(arr: &dyn Array, idx: usize) -> String { async fn setup_db_with_buffer(mode: BufferMode) -> Result<(Arc, Arc, String)> { let cfg = TestConfigBuilder::new("buf_test").with_buffer_mode(mode).build(); - // Set WALRUS_DATA_DIR env var so walrus-rust uses the correct path + // SAFETY: walrus-rust reads WALRUS_DATA_DIR from environment. We use #[serial] on all tests + // to prevent concurrent access to this process-global state. This is inherently racy but + // acceptable for tests since they run sequentially. unsafe { std::env::set_var("WALRUS_DATA_DIR", &cfg.core.walrus_data_dir) }; let layer = Arc::new(BufferedWriteLayer::with_config(Arc::clone(&cfg))?); let db = Arc::new(Database::with_config(cfg).await?.with_buffered_layer(Arc::clone(&layer))); From 201449daf2962541a00a9ec8c2269691a0b45b59 Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 17:24:54 -0800 Subject: [PATCH 4/6] Refactor: extract duplicated Delta scan logic and improve code safety - Add SAFETY comment for unsafe env::set_var explaining why it's acceptable in the Delta table creation context (consistent values, early execution) - Extract duplicated schema coercion logic into scan_delta_table() and coerce_plan_to_schema() helpers, reducing ~60 lines of duplication - Fix convert_expr_to_delta comment to accurately describe the recursive tree transformation behavior --- src/database.rs | 189 ++++++++++++++++-------------------------------- src/dml.rs | 4 +- 2 files changed, 67 insertions(+), 126 deletions(-) diff --git a/src/database.rs b/src/database.rs index 7b717ea..652b7e2 100644 --- a/src/database.rs +++ b/src/database.rs @@ -1131,12 +1131,18 @@ impl Database { Ok(Arc::new(store)) } - /// Creates or loads a DeltaTable with proper configuration - /// Sets environment variables from storage_options to ensure delta-rs credential resolution works + /// Creates or loads a DeltaTable with proper configuration. + /// Sets environment variables from storage_options to ensure delta-rs credential resolution works. async fn create_or_load_delta_table( &self, storage_uri: &str, storage_options: HashMap, cached_store: Arc, ) -> Result { - // Set env vars from storage_options for delta-rs credential resolution + // SAFETY: delta-rs internally uses std::env::var() for AWS credential resolution. + // While set_var is unsafe in multi-threaded contexts (potential data races with concurrent + // env reads), this is acceptable here because: + // 1. We only set AWS_* vars which are read by the AWS SDK during client initialization + // 2. The values are consistent across calls (same credentials for same storage_options) + // 3. Delta table creation happens early in request processing, before parallel query execution + // 4. 
The alternative (forking processes or thread-local storage) adds significant complexity for (key, value) in &storage_options { if key.starts_with("AWS_") { unsafe { @@ -1695,13 +1701,11 @@ impl ProjectRoutingTable { Ok(Arc::new(DataSourceExec::new(Arc::new(mem_source)))) } - /// Helper to scan Delta only (when no MemBuffer data) - async fn scan_delta_only( - &self, state: &dyn Session, project_id: &str, projection: Option<&Vec>, filters: &[Expr], limit: Option, + /// Scan a Delta table and coerce output schema to match our expected types. + /// Handles object store registration, projection translation, and type coercion (e.g., Utf8 -> Utf8View). + async fn scan_delta_table( + &self, table: &DeltaTable, state: &dyn Session, projection: Option<&Vec>, filters: &[Expr], limit: Option, ) -> DFResult> { - let delta_table = self.database.resolve_table(project_id, &self.table_name).await?; - let table = delta_table.read().await; - // Register the object store with DataFusion's runtime so table_provider().scan() can access it let log_store = table.log_store(); let root_store = log_store.root_object_store(None); @@ -1715,16 +1719,14 @@ impl ProjectRoutingTable { let provider = table.table_provider().await.map_err(|e| DataFusionError::External(Box::new(e)))?; - // Translate projection indices from our schema to delta table's schema - // The projection indices from DataFusion are based on ProjectRoutingTable.schema, - // but the delta table provider expects indices based on its own schema + // Translate projection indices from our schema to delta table's schema. + // DataFusion passes indices based on ProjectRoutingTable.schema, but the + // delta table provider expects indices based on its own schema. let delta_schema = provider.schema(); let translated_projection = projection.map(|proj| { proj.iter() .filter_map(|&idx| { - // Get column name from our schema let col_name = self.schema.field(idx).name(); - // Find column index in delta schema delta_schema.fields().iter().position(|f| f.name() == col_name) }) .collect::>() @@ -1733,46 +1735,58 @@ impl ProjectRoutingTable { let delta_plan = provider.scan(state, translated_projection.as_ref(), filters, limit).await?; // Determine target schema based on projection - let target_schema = if let Some(proj) = projection { - Arc::new(arrow_schema::Schema::new( - proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>(), - )) - } else { - self.schema.clone() + let target_schema = match projection { + Some(proj) => Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())), + None => self.schema.clone(), }; - // Coerce delta output schema to match our expected schema (e.g., Utf8 -> Utf8View) - let delta_output_schema = delta_plan.schema(); - if delta_output_schema.fields().len() == target_schema.fields().len() { - let needs_coercion = delta_output_schema - .fields() - .iter() - .zip(target_schema.fields()) - .any(|(delta_field, target_field)| delta_field.data_type() != target_field.data_type()); - - if needs_coercion { - // Create cast expressions for each column - let cast_exprs: Vec<(Arc, String)> = delta_output_schema - .fields() - .iter() - .enumerate() - .zip(target_schema.fields()) - .map(|((idx, delta_field), target_field)| { - let col_expr = Arc::new(PhysicalColumn::new(delta_field.name(), idx)) as Arc; - let expr: Arc = if delta_field.data_type() != target_field.data_type() { - Arc::new(CastExpr::new(col_expr, target_field.data_type().clone(), None)) - } else { - col_expr - }; - (expr, 
target_field.name().clone()) - }) - .collect(); + Self::coerce_plan_to_schema(delta_plan, &target_schema) + } - return Ok(Arc::new(ProjectionExec::try_new(cast_exprs, delta_plan)?)); - } + /// Wrap an execution plan with type coercion if the output schema doesn't match the target. + /// This handles cases like Delta returning Utf8 when we expect Utf8View. + fn coerce_plan_to_schema(plan: Arc, target_schema: &SchemaRef) -> DFResult> { + let plan_schema = plan.schema(); + if plan_schema.fields().len() != target_schema.fields().len() { + return Ok(plan); + } + + let needs_coercion = plan_schema + .fields() + .iter() + .zip(target_schema.fields()) + .any(|(plan_field, target_field)| plan_field.data_type() != target_field.data_type()); + + if !needs_coercion { + return Ok(plan); } - Ok(delta_plan) + let cast_exprs: Vec<(Arc, String)> = plan_schema + .fields() + .iter() + .enumerate() + .zip(target_schema.fields()) + .map(|((idx, plan_field), target_field)| { + let col_expr = Arc::new(PhysicalColumn::new(plan_field.name(), idx)) as Arc; + let expr: Arc = if plan_field.data_type() != target_field.data_type() { + Arc::new(CastExpr::new(col_expr, target_field.data_type().clone(), None)) + } else { + col_expr + }; + (expr, target_field.name().clone()) + }) + .collect(); + + Ok(Arc::new(ProjectionExec::try_new(cast_exprs, plan)?)) + } + + /// Helper to scan Delta only (when no MemBuffer data) + async fn scan_delta_only( + &self, state: &dyn Session, project_id: &str, projection: Option<&Vec>, filters: &[Expr], limit: Option, + ) -> DFResult> { + let delta_table = self.database.resolve_table(project_id, &self.table_name).await?; + let table = delta_table.read().await; + self.scan_delta_table(&table, state, projection, filters, limit).await } /// Extract time range (min, max) from query filters. 
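[Editor's note] The coerce_plan_to_schema helper above has its generic arguments stripped in this view. Restored against the imports declared earlier in src/database.rs (CastExpr, PhysicalColumn, ProjectionExec), the cast projection it builds is roughly the following sketch; error/result aliases are assumed to be DataFusion's standard ones.

use std::sync::Arc;
use arrow_schema::SchemaRef;
use datafusion::error::Result as DFResult;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::physical_expr::expressions::{CastExpr, Column as PhysicalColumn};
use datafusion::physical_plan::{ExecutionPlan, projection::ProjectionExec};

/// Reconstructed sketch: cast every column whose type differs from the target
/// schema (e.g. Utf8 coming out of Delta where the routing table expects Utf8View).
fn coerce_plan_to_schema(plan: Arc<dyn ExecutionPlan>, target: &SchemaRef) -> DFResult<Arc<dyn ExecutionPlan>> {
    let plan_schema = plan.schema();
    if plan_schema.fields().len() != target.fields().len() {
        return Ok(plan); // shapes differ; leave the plan untouched
    }
    let needs_cast = plan_schema
        .fields()
        .iter()
        .zip(target.fields())
        .any(|(a, b)| a.data_type() != b.data_type());
    if !needs_cast {
        return Ok(plan);
    }
    let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = plan_schema
        .fields()
        .iter()
        .enumerate()
        .zip(target.fields())
        .map(|((idx, src), dst)| {
            let col: Arc<dyn PhysicalExpr> = Arc::new(PhysicalColumn::new(src.name(), idx));
            let expr: Arc<dyn PhysicalExpr> = if src.data_type() != dst.data_type() {
                Arc::new(CastExpr::new(col, dst.data_type().clone(), None))
            } else {
                col
            };
            (expr, dst.name().clone())
        })
        .collect();
    Ok(Arc::new(ProjectionExec::try_new(exprs, plan)?))
}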
@@ -2047,82 +2061,7 @@ impl TableProvider for ProjectRoutingTable { let resolve_span = tracing::trace_span!(parent: &span, "resolve_delta_table"); let delta_table = self.database.resolve_table(&project_id, &self.table_name).instrument(resolve_span).await?; let table = delta_table.read().await; - - // Register the object store with DataFusion's runtime so table_provider().scan() can access it - let log_store = table.log_store(); - let root_store = log_store.root_object_store(None); - let bucket_url = { - let table_url = table.table_url(); - let scheme = table_url.scheme(); - let bucket = table_url.host_str().unwrap_or(""); - Url::parse(&format!("{}://{}/", scheme, bucket)).expect("valid bucket URL") - }; - state.runtime_env().register_object_store(&bucket_url, root_store); - - let scan_span = tracing::trace_span!("delta_table.scan", - table.name = %self.table_name, - table.project_id = %project_id, - partition_filters = ?delta_filters.iter().filter(|f| matches!(f, Expr::BinaryExpr(_))).count() - ); - - let provider = table.table_provider().await.map_err(|e| DataFusionError::External(Box::new(e)))?; - - // Translate projection indices from our schema to delta table's schema - let delta_schema = provider.schema(); - let translated_projection = projection.map(|proj| { - proj.iter() - .filter_map(|&idx| { - let col_name = self.schema.field(idx).name(); - delta_schema.fields().iter().position(|f| f.name() == col_name) - }) - .collect::>() - }); - - let delta_plan = provider.scan(state, translated_projection.as_ref(), &delta_filters, limit).instrument(scan_span).await?; - - // Determine target schema based on projection - let target_schema = if let Some(proj) = projection { - Arc::new(arrow_schema::Schema::new( - proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>(), - )) - } else { - self.schema.clone() - }; - - // Coerce delta output schema to match our expected schema (e.g., Utf8 -> Utf8View) - let delta_output_schema = delta_plan.schema(); - let delta_plan = if delta_output_schema.fields().len() == target_schema.fields().len() { - let needs_coercion = delta_output_schema - .fields() - .iter() - .zip(target_schema.fields()) - .any(|(delta_field, target_field)| delta_field.data_type() != target_field.data_type()); - - if needs_coercion { - // Create cast expressions for each column - let cast_exprs: Vec<(Arc, String)> = delta_output_schema - .fields() - .iter() - .enumerate() - .zip(target_schema.fields()) - .map(|((idx, delta_field), target_field)| { - let col_expr = Arc::new(PhysicalColumn::new(delta_field.name(), idx)) as Arc; - let expr: Arc = if delta_field.data_type() != target_field.data_type() { - Arc::new(CastExpr::new(col_expr, target_field.data_type().clone(), None)) - } else { - col_expr - }; - (expr, target_field.name().clone()) - }) - .collect(); - - Arc::new(ProjectionExec::try_new(cast_exprs, delta_plan)?) as Arc - } else { - delta_plan - } - } else { - delta_plan - }; + let delta_plan = self.scan_delta_table(&table, state, projection, &delta_filters, limit).await?; // Union both plans (mem data first for recency, then Delta for historical) UnionExec::try_new(vec![mem_plan, delta_plan]) diff --git a/src/dml.rs b/src/dml.rs index 3718efe..1d6de76 100644 --- a/src/dml.rs +++ b/src/dml.rs @@ -530,7 +530,9 @@ where } /// Convert DataFusion Expr to Delta-compatible format. -/// Only strips table qualifiers from columns - Utf8View is kept for consistency. 
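[Editor's note] The replacement doc comment that follows describes convert_expr_to_delta as a recursive walk over the expression tree, but the function body is cut off in this view. A minimal sketch of that kind of qualifier-stripping rewrite, assuming DataFusion's current TreeNode::transform / Transformed API, looks like:

use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::common::{Column, Result};
use datafusion::logical_expr::Expr;

/// Sketch only: strip table qualifiers from every column reference so that
/// `table.column` becomes `column`; every other expression node passes through.
fn strip_qualifiers(expr: &Expr) -> Result<Expr> {
    expr.clone()
        .transform(|e| {
            Ok(match e {
                Expr::Column(c) if c.relation.is_some() => {
                    Transformed::yes(Expr::Column(Column::new_unqualified(c.name)))
                }
                other => Transformed::no(other),
            })
        })
        .map(|t| t.data)
}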
+/// Recursively walks the expression tree and strips table qualifiers from Column references +/// (e.g., `table.column` becomes just `column`). All other expression types (literals, +/// binary ops, functions, etc.) pass through unchanged, preserving types like Utf8View. fn convert_expr_to_delta(expr: &Expr) -> Result { use datafusion::common::tree_node::TreeNode; expr.clone() From 02cf6464e8454204380c101a506a52829422645d Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 19:40:22 -0800 Subject: [PATCH 5/6] Fix WAL recovery test and improve error handling - Enable test_recovery by setting WALRUS_DATA_DIR env var - Use test_helpers for proper schema-compatible test batches - Add #[serial] to prevent test isolation issues - Improve error handling in wal.rs persist_topic() - Remove explicit shutdown to avoid premature WAL consumption --- src/buffered_write_layer.rs | 38 ++++++++++++++++++------------------- src/database.rs | 4 +++- src/wal.rs | 26 +++++++++++++++++++------ 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs index 5f4a99a..ff181b8 100644 --- a/src/buffered_write_layer.rs +++ b/src/buffered_write_layer.rs @@ -560,8 +560,8 @@ impl BufferedWriteLayer { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int64Array, StringViewArray}; - use arrow::datatypes::{DataType, Field, Schema}; + use crate::test_utils::test_helpers::{json_to_batch, test_span}; + use serial_test::serial; use std::path::PathBuf; use tempfile::tempdir; @@ -571,14 +571,14 @@ mod tests { Arc::new(cfg) } - fn create_test_batch() -> RecordBatch { - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int64, false), - Field::new("name", DataType::Utf8View, false), - ])); - let id_array = Int64Array::from(vec![1, 2, 3]); - let name_array = StringViewArray::from(vec!["a", "b", "c"]); - RecordBatch::try_new(schema, vec![Arc::new(id_array), Arc::new(name_array)]).unwrap() + fn create_test_batch(project_id: &str) -> RecordBatch { + // Use test_span helper which creates data matching the default schema + json_to_batch(vec![ + test_span("test1", "span1", project_id), + test_span("test2", "span2", project_id), + test_span("test3", "span3", project_id), + ]) + .unwrap() } #[tokio::test] @@ -592,7 +592,7 @@ mod tests { let table = format!("t{}", test_id); let layer = BufferedWriteLayer::with_config(cfg).unwrap(); - let batch = create_test_batch(); + let batch = create_test_batch(&project); layer.insert(&project, &table, vec![batch.clone()]).await.unwrap(); @@ -601,15 +601,16 @@ mod tests { assert_eq!(results[0].num_rows(), 3); } - // NOTE: This test is ignored because walrus-rust creates new files for each instance - // rather than discovering existing files from previous instances in the same directory. - // This is a limitation of the walrus library, not our code. - #[ignore] + #[serial] #[tokio::test] async fn test_recovery() { let dir = tempdir().unwrap(); let cfg = create_test_config(dir.path().to_path_buf()); + // SAFETY: walrus-rust reads WALRUS_DATA_DIR from environment. We use #[serial] + // to prevent concurrent access to this process-global state. 
+ unsafe { std::env::set_var("WALRUS_DATA_DIR", &cfg.core.walrus_data_dir) }; + // Use unique but short project/table names (walrus has metadata size limit) let test_id = &uuid::Uuid::new_v4().to_string()[..4]; let project = format!("r{}", test_id); @@ -618,10 +619,9 @@ mod tests { // First instance - write data { let layer = BufferedWriteLayer::with_config(Arc::clone(&cfg)).unwrap(); - let batch = create_test_batch(); + let batch = create_test_batch(&project); layer.insert(&project, &table, vec![batch]).await.unwrap(); - // Shutdown to ensure WAL is synced - layer.shutdown().await.unwrap(); + // Layer drops here - WAL data should be persisted } // Second instance - recover from WAL @@ -648,7 +648,7 @@ mod tests { let layer = BufferedWriteLayer::with_config(cfg).unwrap(); // First insert should succeed - let batch = create_test_batch(); + let batch = create_test_batch(&project); layer.insert(&project, &table, vec![batch]).await.unwrap(); // Verify reservation is released (should be 0 after successful insert) diff --git a/src/database.rs b/src/database.rs index 652b7e2..0afe4d5 100644 --- a/src/database.rs +++ b/src/database.rs @@ -1736,7 +1736,9 @@ impl ProjectRoutingTable { // Determine target schema based on projection let target_schema = match projection { - Some(proj) => Arc::new(arrow_schema::Schema::new(proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>())), + Some(proj) => Arc::new(arrow_schema::Schema::new( + proj.iter().map(|&idx| self.schema.field(idx).clone()).collect::>(), + )), None => self.schema.clone(), }; diff --git a/src/wal.rs b/src/wal.rs index 89c6b0a..c143148 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -210,10 +210,18 @@ impl WalManager { fn persist_topic(&self, topic: &str) { if self.known_topics.insert(topic.to_string()) { let meta_dir = self.data_dir.join(".timefusion_meta"); - let _ = std::fs::create_dir_all(&meta_dir); - if let Ok(mut file) = std::fs::OpenOptions::new().create(true).append(true).open(meta_dir.join("topics")) { - use std::io::Write; - let _ = writeln!(file, "{}", topic); + if let Err(e) = std::fs::create_dir_all(&meta_dir) { + warn!("Failed to create WAL meta dir {:?}: {}", meta_dir, e); + return; + } + match std::fs::OpenOptions::new().create(true).append(true).open(meta_dir.join("topics")) { + Ok(mut file) => { + use std::io::Write; + if let Err(e) = writeln!(file, "{}", topic) { + warn!("Failed to write topic '{}' to index: {}", topic, e); + } + } + Err(e) => warn!("Failed to open topics file: {}", e), } } } @@ -414,7 +422,10 @@ fn serialize_record_batch(batch: &RecordBatch) -> Result, WalError> { fn deserialize_record_batch(data: &[u8], schema: &SchemaRef) -> Result { if data.len() > MAX_BATCH_SIZE { - return Err(WalError::BatchTooLarge { size: data.len(), max: MAX_BATCH_SIZE }); + return Err(WalError::BatchTooLarge { + size: data.len(), + max: MAX_BATCH_SIZE, + }); } let (compact, _): (CompactBatch, _) = bincode::decode_from_slice(data, BINCODE_CONFIG)?; @@ -451,7 +462,10 @@ fn deserialize_wal_entry(data: &[u8]) -> Result { return Err(WalError::TooShort { len: data.len() }); } if data[4] != WAL_VERSION { - return Err(WalError::UnsupportedVersion { version: data[4], expected: WAL_VERSION }); + return Err(WalError::UnsupportedVersion { + version: data[4], + expected: WAL_VERSION, + }); } WalOperation::try_from(data[5])?; let (entry, _): (WalEntry, _) = bincode::decode_from_slice(&data[6..], BINCODE_CONFIG)?; From a91b3bfea0a6a4af6136e4fd42276d166050aeb0 Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Wed, 28 Jan 2026 
19:57:31 -0800 Subject: [PATCH 6/6] cleanups --- Cargo.lock | 1 + Cargo.toml | 1 + src/buffered_write_layer.rs | 6 +++- src/database.rs | 55 ++++++++++++++++++++++++------------- src/mem_buffer.rs | 4 +-- src/pgwire_handlers.rs | 4 +++ src/wal.rs | 12 ++++---- 7 files changed, 56 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a4602d7..38080d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6772,6 +6772,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", + "parking_lot", "rand 0.9.2", "regex", "scopeguard", diff --git a/Cargo.toml b/Cargo.toml index c09a4f4..fe2ba79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,7 @@ ahash = "0.8" lru = "0.16.1" serde_bytes = "0.11.19" dashmap = "6.1" +parking_lot = "0.12" envy = "0.4" tdigests = "1.0" bincode = { version = "2.0", features = ["serde"] } diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs index ff181b8..7646048 100644 --- a/src/buffered_write_layer.rs +++ b/src/buffered_write_layer.rs @@ -138,10 +138,14 @@ impl BufferedWriteLayer { return Ok(estimated_size); } - // Exponential backoff: spin_loop for first few attempts, then yield + // Exponential backoff: spin_loop for first few attempts, then brief sleep. + // Note: Using std::thread::sleep in this sync function called from async context. + // This is acceptable because: (1) max sleep is ~1ms, (2) only under high contention, + // (3) converting to async would require spawn_blocking which adds more overhead. if attempt < 5 { std::hint::spin_loop(); } else { + // Max backoff = 1μs << 10 = 1024μs ≈ 1ms let backoff_micros = CAS_BACKOFF_BASE_MICROS << attempt.min(CAS_BACKOFF_MAX_EXPONENT); std::thread::sleep(std::time::Duration::from_micros(backoff_micros)); } diff --git a/src/database.rs b/src/database.rs index 0afe4d5..c025dad 100644 --- a/src/database.rs +++ b/src/database.rs @@ -36,9 +36,11 @@ use deltalake::operations::create::CreateBuilder; use deltalake::{DeltaTable, DeltaTableBuilder}; use futures::StreamExt; use instrumented_object_store::instrument_object_store; +use std::sync::Mutex; use serde::{Deserialize, Serialize}; use sqlx::{PgPool, postgres::PgPoolOptions}; use std::fmt; +use std::sync::OnceLock; use std::{any::Any, collections::HashMap, sync::Arc}; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; @@ -46,6 +48,14 @@ use tracing::field::Empty; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; +/// Mutex to serialize access to environment variable modifications. +/// Required because delta-rs uses std::env::var() for AWS credential resolution, +/// and std::env::set_var is unsafe in multi-threaded contexts. +static ENV_MUTEX: OnceLock> = OnceLock::new(); +fn env_mutex() -> &'static Mutex<()> { + ENV_MUTEX.get_or_init(|| Mutex::new(())) +} + // Changed to support multiple tables per project: (project_id, table_name) -> DeltaTable pub type ProjectConfigs = Arc>>>>; @@ -1136,17 +1146,18 @@ impl Database { async fn create_or_load_delta_table( &self, storage_uri: &str, storage_options: HashMap, cached_store: Arc, ) -> Result { - // SAFETY: delta-rs internally uses std::env::var() for AWS credential resolution. - // While set_var is unsafe in multi-threaded contexts (potential data races with concurrent - // env reads), this is acceptable here because: - // 1. We only set AWS_* vars which are read by the AWS SDK during client initialization - // 2. The values are consistent across calls (same credentials for same storage_options) - // 3. 
Delta table creation happens early in request processing, before parallel query execution - // 4. The alternative (forking processes or thread-local storage) adds significant complexity - for (key, value) in &storage_options { - if key.starts_with("AWS_") { - unsafe { - std::env::set_var(key, value); + // delta-rs uses std::env::var() for AWS credential resolution. + // We serialize access with ENV_MUTEX to prevent data races from concurrent set_var calls. + { + let _guard = env_mutex().lock(); + for (key, value) in &storage_options { + if key.starts_with("AWS_") { + // SAFETY: Protected by ENV_MUTEX. set_var is only unsafe due to potential + // concurrent reads, which we prevent by holding the mutex during the entire + // block. The mutex ensures only one thread modifies env vars at a time. + unsafe { + std::env::set_var(key, value); + } } } } @@ -1194,9 +1205,8 @@ impl Database { // Fallback to legacy batch queue if configured let enable_queue = self.config.core.enable_batch_queue; - if !skip_queue && enable_queue && self.batch_queue.is_some() { + if !skip_queue && enable_queue && let Some(ref queue) = self.batch_queue { span.record("use_queue", true); - let queue = self.batch_queue.as_ref().unwrap(); for batch in batches { if let Err(e) = queue.queue(batch) { return Err(anyhow::anyhow!("Queue error: {}", e)); @@ -1724,12 +1734,19 @@ impl ProjectRoutingTable { // delta table provider expects indices based on its own schema. let delta_schema = provider.schema(); let translated_projection = projection.map(|proj| { - proj.iter() - .filter_map(|&idx| { - let col_name = self.schema.field(idx).name(); - delta_schema.fields().iter().position(|f| f.name() == col_name) - }) - .collect::>() + let mut translated = Vec::with_capacity(proj.len()); + for &idx in proj { + let col_name = self.schema.field(idx).name(); + if let Some(delta_idx) = delta_schema.fields().iter().position(|f| f.name() == col_name) { + translated.push(delta_idx); + } else { + warn!( + "Column '{}' requested in projection but not found in Delta schema for table '{}'", + col_name, self.table_name + ); + } + } + translated }); let delta_plan = provider.scan(state, translated_projection.as_ref(), filters, limit).await?; diff --git a/src/mem_buffer.rs b/src/mem_buffer.rs index dd2b51c..adbc791 100644 --- a/src/mem_buffer.rs +++ b/src/mem_buffer.rs @@ -12,7 +12,7 @@ use datafusion::sql::sqlparser::dialect::GenericDialect; use datafusion::sql::sqlparser::parser::Parser as SqlParser; use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; -use tracing::{debug, instrument, warn}; +use tracing::{debug, info, instrument, warn}; // 10-minute buckets balance flush granularity vs overhead. Shorter = more flushes, // longer = larger Delta files. Matches default flush interval for aligned boundaries. @@ -46,7 +46,7 @@ fn schemas_compatible(existing: &SchemaRef, incoming: &SchemaRef) -> bool { } } if new_fields > 0 { - debug!("Schema evolution: {} new nullable field(s) added", new_fields); + info!("Schema evolution: {} new nullable field(s) added", new_fields); } true } diff --git a/src/pgwire_handlers.rs b/src/pgwire_handlers.rs index fd7526d..8485491 100644 --- a/src/pgwire_handlers.rs +++ b/src/pgwire_handlers.rs @@ -71,6 +71,8 @@ pub struct LoggingSimpleQueryHandler { } impl LoggingSimpleQueryHandler { + /// Create a new LoggingSimpleQueryHandler. + /// Note: auth_manager is unused since datafusion-postgres 0.14.0 moved auth to server level. 
pub fn new(session_context: Arc, _auth_manager: Arc) -> Self { Self { inner: DfSessionService::new(session_context), @@ -144,6 +146,8 @@ pub struct LoggingExtendedQueryHandler { } impl LoggingExtendedQueryHandler { + /// Create a new LoggingExtendedQueryHandler. + /// Note: auth_manager is unused since datafusion-postgres 0.14.0 moved auth to server level. pub fn new(session_context: Arc, _auth_manager: Arc) -> Self { Self { inner: DfSessionService::new(session_context), diff --git a/src/wal.rs b/src/wal.rs index c143148..836e0ce 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -118,7 +118,7 @@ impl CompactColumn { Self { null_bitmap: data.nulls().map(|n| n.buffer().as_slice().to_vec()), buffers: data.buffers().iter().map(|b| b.as_slice().to_vec()).collect(), - children: data.child_data().iter().map(|c| Self::from_array_data(c)).collect(), + children: data.child_data().iter().map(Self::from_array_data).collect(), null_count: data.null_count(), child_lens: data.child_data().iter().map(|c| c.len()).collect(), } @@ -128,7 +128,7 @@ impl CompactColumn { Self { null_bitmap: data.nulls().map(|n| n.buffer().as_slice().to_vec()), buffers: data.buffers().iter().map(|b| b.as_slice().to_vec()).collect(), - children: data.child_data().iter().map(|c| Self::from_array_data(c)).collect(), + children: data.child_data().iter().map(Self::from_array_data).collect(), null_count: data.null_count(), child_lens: data.child_data().iter().map(|c| c.len()).collect(), } @@ -454,9 +454,11 @@ fn deserialize_wal_entry(data: &[u8]) -> Result { } if data[0..4] == WAL_MAGIC { - // v1+ format: data[4] is version byte (>= 1), data[5] is operation - // v0 format: data[4] is operation (0-2), no version byte - // Distinguish: if data[4] > 2, it must be a version byte + // WAL format detection based on byte 4: + // - v0 (legacy): data[4] is operation byte (0=Insert, 1=Delete, 2=Update) + // - v1+ (current): data[4] is version byte (>=128), data[5] is operation + // Since WalOperation values are 0-2 and WAL_VERSION is 128, we can safely + // distinguish formats: if data[4] > 2, it must be a version byte, not an operation. if data[4] > 2 { if data.len() < 6 { return Err(WalError::TooShort { len: data.len() });
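[Editor's note] Taken together with patch 3, the format-detection comment above pins down the framing: a current-format WAL entry is 4 magic bytes ("WAL2"), a version byte (128), an operation byte, then the bincode payload, while values 0-2 at offset 4 mark the legacy layout. A condensed sketch of the encoder and the detection rule, using the constants defined in src/wal.rs (the real decoder also handles the two legacy layouts and surfaces typed errors):

const WAL_MAGIC: [u8; 4] = [0x57, 0x41, 0x4C, 0x32]; // "WAL2"
const WAL_VERSION: u8 = 128; // > 2, so it can never be confused with an operation byte

fn frame_entry(operation: u8, payload: &[u8]) -> Vec<u8> {
    let mut buf = WAL_MAGIC.to_vec();
    buf.push(WAL_VERSION);          // byte 4: version
    buf.push(operation);            // byte 5: operation (0=Insert, 1=Delete, 2=Update)
    buf.extend_from_slice(payload); // bincode-encoded WalEntry
    buf
}

/// Returns (operation, payload) for a current-format entry, or None for
/// anything that should take another decode path.
fn parse_current(data: &[u8]) -> Option<(u8, &[u8])> {
    if data.len() < 6 || data[0..4] != WAL_MAGIC {
        return None; // bare-bincode legacy entries (or truncated data) are handled elsewhere
    }
    if data[4] <= 2 {
        return None; // legacy v0: byte 4 is already the operation
    }
    if data[4] != WAL_VERSION {
        return None; // unknown future version; the real decoder returns UnsupportedVersion here
    }
    Some((data[5], &data[6..]))
}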