Skip to content

Commit dd1bfaa

Browse files
committed
Fix and test arcticdb reading streaming data
Fixes: - Column filter in static schema - Column ordering when introducing a new column with an incomplete segment Tests: - Column filter in static and dynamic schema - Reading incompletes with different schemas - Compatibility test for reading incompletes from an old environment
1 parent 7b4bfd1 commit dd1bfaa

File tree

4 files changed

+152
-8
lines changed

4 files changed

+152
-8
lines changed

cpp/arcticdb/pipeline/read_pipeline.hpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -141,19 +141,24 @@ inline void generate_filtered_field_descriptors(std::shared_ptr<PipelineContext>
141141
generate_filtered_field_descriptors(*context, columns);
142142
}
143143

144+
inline void get_column_bitset_in_context(
145+
const ReadQuery& query,
146+
const std::shared_ptr<PipelineContext>& pipeline_context) {
147+
pipeline_context->set_selected_columns(query.columns);
148+
pipeline_context->overall_column_bitset_ = overall_column_bitset(pipeline_context->descriptor(),
149+
query.clauses_,
150+
pipeline_context->selected_columns_);
151+
}
152+
144153
template<class ContainerType>
145154
inline std::vector<FilterQuery<ContainerType>> get_column_bitset_and_query_functions(
146155
const ReadQuery& query,
147156
const std::shared_ptr<PipelineContext>& pipeline_context,
148157
bool dynamic_schema,
149158
bool column_groups) {
150159
using namespace arcticdb::pipelines::index;
151-
152160
if(!dynamic_schema || column_groups) {
153-
pipeline_context->set_selected_columns(query.columns);
154-
pipeline_context->overall_column_bitset_ = overall_column_bitset(pipeline_context->descriptor(),
155-
query.clauses_,
156-
pipeline_context->selected_columns_);
161+
get_column_bitset_in_context(query, pipeline_context);
157162
}
158163
return build_read_query_filters<ContainerType>(pipeline_context, query.row_filter, dynamic_schema, column_groups);
159164
}

cpp/arcticdb/version/version_core.cpp

+11-3
Original file line numberDiff line numberDiff line change
@@ -1085,9 +1085,17 @@ bool read_incompletes_to_pipeline(
10851085
// Mark the start point of the incompletes, so we know that there is no column slicing after this point
10861086
pipeline_context->incompletes_after_ = pipeline_context->slice_and_keys_.size();
10871087

1088-
// If there are only incompletes we need to add the index here
10891088
if(pipeline_context->slice_and_keys_.empty()) {
1089+
// If there are only incompletes we need to do the following (typically done when reading the index key):
1090+
// - add the index columns to query
1091+
// - in case of static schema: populate the descriptor and column_bitset
10901092
add_index_columns_to_query(read_query, seg.index_descriptor());
1093+
if (!dynamic_schema) {
1094+
pipeline_context->desc_ = seg.descriptor();
1095+
get_column_bitset_in_context(
1096+
read_query,
1097+
pipeline_context);
1098+
}
10911099
}
10921100
pipeline_context->slice_and_keys_.insert(std::end(pipeline_context->slice_and_keys_), incomplete_segments.begin(), incomplete_segments.end());
10931101

@@ -1116,9 +1124,9 @@ bool read_incompletes_to_pipeline(
11161124
pipeline_context->staged_descriptor_ =
11171125
merge_descriptors(seg.descriptor(), incomplete_segments, read_query.columns);
11181126
if (pipeline_context->desc_) {
1119-
const std::array fields_ptr = {pipeline_context->desc_->fields_ptr()};
1127+
const std::array staged_fields_ptr = {pipeline_context->staged_descriptor_->fields_ptr()};
11201128
pipeline_context->desc_ =
1121-
merge_descriptors(*pipeline_context->staged_descriptor_, fields_ptr, read_query.columns);
1129+
merge_descriptors(*pipeline_context->desc_, staged_fields_ptr, read_query.columns);
11221130
} else {
11231131
pipeline_context->desc_ = pipeline_context->staged_descriptor_;
11241132
}

python/tests/compat/arcticdb/test_compatibility.py

+47
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22
from packaging import version
33
import pandas as pd
4+
import numpy as np
45
from arcticdb.util.test import assert_frame_equal
56
from arcticdb.options import ModifiableEnterpriseLibraryOption
67
from arcticdb.toolbox.library_tool import LibraryTool
@@ -171,3 +172,49 @@ def test_compat_snapshot_metadata_read(old_venv_and_arctic_uri, lib_name):
171172
snaps = curr.lib.list_snapshots()
172173
meta = snaps["old_snap"]
173174
assert meta == {"old_key": "old_value"}
175+
176+
177+
def test_compat_read_incomplete(old_venv_and_arctic_uri, lib_name):
178+
old_venv, arctic_uri = old_venv_and_arctic_uri
179+
sym = "sym"
180+
df = pd.DataFrame({
181+
"col": np.arange(10),
182+
"float_col": np.arange(10, dtype=np.float64),
183+
"str_col": [f"str_{i}" for i in range(10)]
184+
}, pd.date_range("2024-01-01", periods=10))
185+
df_1 = df.iloc[:8]
186+
df_2 = df.iloc[8:]
187+
188+
old_ac = old_venv.create_arctic(arctic_uri)
189+
old_lib = old_ac.create_library(lib_name)
190+
191+
if version.Version(old_venv.version) >= version.Version("5.1.0"):
192+
# In version 5.1.0 (with commit a3b7545) we moved the streaming incomplete python API to the library tool.
193+
old_lib.execute([
194+
"""
195+
lib_tool = lib.library_tool()
196+
lib_tool.append_incomplete("sym", df_1)
197+
lib_tool.append_incomplete("sym", df_2)
198+
"""
199+
], dfs={"df_1": df_1, "df_2": df_2})
200+
else:
201+
old_lib.execute([
202+
"""
203+
lib._nvs.append("sym", df_1, incomplete=True)
204+
lib._nvs.append("sym", df_2, incomplete=True)
205+
"""
206+
], dfs={"df_1": df_1, "df_2": df_2})
207+
208+
209+
with CurrentVersion(arctic_uri, lib_name) as curr:
210+
read_df = curr.lib._nvs.read(sym, date_range=(None, None), incomplete=True).data
211+
assert_frame_equal(read_df, df)
212+
213+
read_df = curr.lib._nvs.read(sym, date_range=(None, None), incomplete=True, columns=["float_col"]).data
214+
assert_frame_equal(read_df, df[["float_col"]])
215+
216+
read_df = curr.lib._nvs.read(sym, date_range=(None, None), incomplete=True, columns=["float_col", "str_col"]).data
217+
assert_frame_equal(read_df, df[["float_col", "str_col"]])
218+
219+
read_df = curr.lib._nvs.read(sym, date_range=(pd.Timestamp(2024, 1, 5), pd.Timestamp(2024, 1, 9)), incomplete=True, columns=["float_col", "str_col"]).data
220+
assert_frame_equal(read_df, df[["float_col", "str_col"]].iloc[4:9])

python/tests/unit/arcticdb/version_store/test_incompletes.py

+84
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from arcticdb.exceptions import MissingDataException
1313
from arcticdb_ext.storage import KeyType
1414

15+
from arcticdb.util.venv import CurrentVersion
1516

1617
@pytest.mark.parametrize("batch", (True, False))
1718
def test_read_incompletes_with_indexed_data(lmdb_version_store_v1, batch):
@@ -80,3 +81,86 @@ def test_read_incompletes_no_chunking(lmdb_version_store_tiny_segment):
8081

8182
ref_keys = lib_tool.find_keys_for_symbol(KeyType.APPEND_REF, sym)
8283
assert len(ref_keys) == 1
84+
85+
@pytest.mark.parametrize("dynamic_schema", [True, False])
86+
def test_read_incompletes_columns_filter(version_store_factory, dynamic_schema):
87+
lib = version_store_factory(dynamic_schema=dynamic_schema)
88+
lib_tool = lib.library_tool()
89+
sym = "sym"
90+
df = pd.DataFrame({
91+
"col": np.arange(20),
92+
"float_col": np.arange(20, dtype=np.float64),
93+
"str_col": [f"str_{i}" for i in range(20)]
94+
}, pd.date_range("2024-01-01", periods=20))
95+
lib_tool.append_incomplete(sym, df.iloc[:5])
96+
lib_tool.append_incomplete(sym, df.iloc[5:8])
97+
lib_tool.append_incomplete(sym, df.iloc[8:10])
98+
99+
date_range = (None, None)
100+
col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["col"]).data
101+
assert_frame_equal(col_df, df[["col"]].iloc[:10])
102+
103+
float_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col"]).data
104+
assert_frame_equal(float_col_df, df[["float_col"]].iloc[:10])
105+
106+
float_and_str_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col", "str_col"]).data
107+
assert_frame_equal(float_and_str_col_df, df[["float_col", "str_col"]].iloc[:10])
108+
109+
date_range = (pd.Timestamp(2024, 1, 3), pd.Timestamp(2024, 1, 8))
110+
float_and_str_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col", "str_col"]).data
111+
assert_frame_equal(float_and_str_col_df, df[["float_col", "str_col"]].iloc[2:8])
112+
113+
# Compact and add the rest of the df
114+
lib.compact_incomplete(sym, append=True, convert_int_to_float=False, via_iteration=False)
115+
lib_tool.append_incomplete(sym, df.iloc[10:17])
116+
lib_tool.append_incomplete(sym, df.iloc[17:])
117+
118+
date_range = (None, None)
119+
float_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col"]).data
120+
assert_frame_equal(float_col_df, df[["float_col"]])
121+
122+
float_and_str_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col", "str_col"]).data
123+
assert_frame_equal(float_and_str_col_df, df[["float_col", "str_col"]])
124+
125+
# Only incomplete range
126+
date_range = (pd.Timestamp(2024, 1, 12), pd.Timestamp(2024, 1, 18))
127+
float_and_str_col_df = lib.read(sym, date_range=date_range, incomplete=True, columns=["float_col", "str_col"]).data
128+
assert_frame_equal(float_and_str_col_df, df[["float_col", "str_col"]].iloc[11:18])
129+
130+
131+
def test_read_incompletes_dynamic(lmdb_version_store_dynamic_schema_v1):
132+
lib = lmdb_version_store_dynamic_schema_v1
133+
lib_tool = lib.library_tool()
134+
sym = "sym"
135+
136+
def get_date(days_after_epoch):
137+
return pd.Timestamp(0) + pd.DateOffset(days=days_after_epoch)
138+
139+
def get_index(days_after_epoch, num_days):
140+
return pd.date_range(get_date(days_after_epoch), periods=num_days, freq="d")
141+
142+
df_1 = pd.DataFrame({"col_1": [1., 2., 3.], "col_2": [1., 2., 3.]}, index=get_index(0, 3))
143+
df_2 = pd.DataFrame({"col_2": [4., 5.], "col_3": [1., 2.]}, index=get_index(3, 2))
144+
df_3 = pd.DataFrame({"col_3": [3., 4.], "col_4": [1., 2.]}, index=get_index(5, 2))
145+
146+
lib_tool.append_incomplete(sym, df_1)
147+
lib_tool.append_incomplete(sym, df_2)
148+
149+
df = lib.read(sym, date_range=(None, None), incomplete=True).data
150+
assert_frame_equal(df, pd.concat([df_1, df_2]))
151+
152+
# If reading just a single incomplete we will get the result in its own schema
153+
df = lib.read(sym, date_range = (get_date(3), None), incomplete=True).data
154+
assert_frame_equal(df, df_2)
155+
156+
lib.compact_incomplete(sym, append=True, convert_int_to_float=False, via_iteration=False)
157+
158+
df = lib.read(sym, date_range=(None, None), incomplete=True).data
159+
assert_frame_equal(df, pd.concat([df_1, df_2]))
160+
161+
lib_tool.append_incomplete(sym, df_3)
162+
df = lib.read(sym, date_range=(None, None), incomplete=True).data
163+
assert_frame_equal(df, pd.concat([df_1, df_2, df_3]))
164+
165+
df_col_filter = lib.read(sym, date_range=(None, None), incomplete=True, columns=["col_2", "col_4"]).data
166+
assert_frame_equal(df_col_filter, pd.concat([df_1, df_2, df_3])[["col_2", "col_4"]])

0 commit comments

Comments
 (0)