
Commit 47e9a5e

Merge branch 'master' into 2221-arcticdb-writes-dont-handle-numpy-slicesviews-correctly
2 parents: 35a5e86 + 7b4bfd1

4 files changed: +289 -4 lines

environment_unix.yml (+1 -1)

@@ -36,7 +36,7 @@ dependencies:
   - azure-core-cpp
   - azure-identity-cpp
   - azure-storage-blobs-cpp
-  - fmt
+  - fmt <11.1
   - folly==2023.09.25.00
   - unordered_dense
   # Vendored build dependencies (see `cpp/thirdparty` and `cpp/vcpkg.json`)

python/arcticdb/util/utils.py (+126)

@@ -553,3 +553,129 @@ def generate_random_dataframe(cls, rows: int, cols: int, indexed: bool = True, s
         gen.add_timestamp_index("index", "s", pd.Timestamp(0))
         return gen.generate_dataframe()
+
+    @classmethod
+    def generate_random_int_dataframe(cls, start_name_prefix: str,
+                                      num_rows: int, num_cols: int,
+                                      dtype: ArcticIntType = np.int64, min_value: int = None, max_value: int = None,
+                                      seed: int = 3432) -> pd.DataFrame:
+        """
+        To be used to generate a large number of columns of the same type when generation time is critical.
+        """
+        np.random.seed(seed=seed)
+        platform_int_info = np.iinfo("int_")
+        iinfo = np.iinfo(dtype)
+        if min_value is None:
+            min_value = max(iinfo.min, platform_int_info.min)
+        if max_value is None:
+            max_value = min(iinfo.max, platform_int_info.max)
+
+        data = np.random.randint(min_value, max_value, size=(num_rows, num_cols), dtype=dtype)
+        columns = [f"{start_name_prefix}_{n}" for n in range(num_cols)]
+
+        return pd.DataFrame(data=data, columns=columns)
+
+    @classmethod
+    def generate_random_float_dataframe(cls, start_name_prefix: str, num_rows: int, num_cols: int,
+                                        dtype: ArcticFloatType = np.float64,
+                                        min_value: float = None, max_value: float = None, round_at: int = None,
+                                        seed: int = 54675) -> pd.DataFrame:
+        """
+        To be used to generate a large number of columns of the same type when generation time is critical.
+        """
+        # Higher bounds would trigger overflow in numpy's uniform (-1e307 - 1e307),
+        # so clamp to the minimum and maximum values of np.float32
+        info = np.finfo(np.float32)
+        _max = info.max
+        _min = info.min
+        np.random.seed(seed)
+        if min_value is None:
+            min_value = max(-1e307, -sys.float_info.max, _min)
+        if max_value is None:
+            max_value = min(1e307, sys.float_info.max, _max)
+        if round_at is None:
+            data = np.random.uniform(min_value, max_value, size=(num_rows, num_cols)).astype(dtype)
+        else:
+            data = np.round(np.random.uniform(min_value, max_value,
+                                              size=(num_rows, num_cols)), round_at).astype(dtype)
+
+        columns = [f"{start_name_prefix}_{n}" for n in range(num_cols)]
+
+        return pd.DataFrame(data=data, columns=columns)
+
+    @classmethod
+    def generate_random_strings_dataframe(cls, start_name_prefix: str, num_rows: int, num_cols: int,
+                                          column_sizes=None, seed: int = 4543):
+        """
+        To be used to generate a large number of columns of the same type when generation time is critical.
+        If `column_sizes` is not supplied, a default string length of 10 is used.
+        """
+        if column_sizes is None:
+            column_sizes = [10] * num_cols
+        np.random.seed(seed=seed)
+        data = [[random_string(column_sizes[col])
+                 for col in range(num_cols)]
+                for _ in range(num_rows)]
+
+        columns = [f"{start_name_prefix}_{n}" for n in range(num_cols)]
+        return pd.DataFrame(data=data, columns=columns)
+
+    @classmethod
+    def generate_wide_dataframe(cls, num_rows: int, num_cols: int,
+                                num_string_cols: int,
+                                start_time: pd.Timestamp = None,
+                                freq: Union[str, timedelta, pd.Timedelta, pd.DateOffset] = 's',
+                                seed=23445):
+        """
+        Generates the specified number of columns as fast as possible, using numpy random array generation.
+        As string generation is the slowest part, be mindful to keep `num_string_cols` between 1 and 1000.
+        The generated dataframe will also have an index starting at the specified `start_time`.
+        """
+
+        cols, mod = divmod(num_cols - num_string_cols, 10)  # divide by the number of distinct numeric frame types
+
+        int_frame1 = cls.generate_random_int_dataframe("int8", num_rows=num_rows, num_cols=cols,
+                                                       dtype=np.int8, seed=seed)
+
+        int_frame2 = cls.generate_random_int_dataframe("int16", num_rows=num_rows, num_cols=cols,
+                                                       dtype=np.int16, seed=seed)
+
+        int_frame3 = cls.generate_random_int_dataframe("int32", num_rows=num_rows, num_cols=cols,
+                                                       dtype=np.int32, seed=seed)
+
+        int_frame4 = cls.generate_random_int_dataframe("int64", num_rows=num_rows, num_cols=cols + mod,
+                                                       dtype=np.int64, seed=seed)
+
+        uint_frame1 = cls.generate_random_int_dataframe("uint8", num_rows=num_rows, num_cols=cols,
+                                                        dtype=np.uint8, seed=seed)
+
+        uint_frame2 = cls.generate_random_int_dataframe("uint16", num_rows=num_rows, num_cols=cols,
+                                                        dtype=np.uint16, seed=seed)
+
+        uint_frame3 = cls.generate_random_int_dataframe("uint32", num_rows=num_rows, num_cols=cols,
+                                                        dtype=np.uint32, seed=seed)
+
+        uint_frame4 = cls.generate_random_int_dataframe("uint64", num_rows=num_rows, num_cols=cols,
+                                                        dtype=np.uint64, seed=seed)
+
+        float_frame1 = cls.generate_random_float_dataframe("float32", num_rows=num_rows, num_cols=cols,
+                                                           dtype=np.float32, seed=seed)
+
+        float_frame2 = cls.generate_random_float_dataframe("float64", num_rows=num_rows, num_cols=cols,
+                                                           dtype=np.float64, seed=seed)
+
+        str_frame = cls.generate_random_strings_dataframe("str", num_rows=num_rows, num_cols=num_string_cols)
+
+        frame: pd.DataFrame = pd.concat([int_frame1, int_frame2, int_frame3, int_frame4,
+                                         uint_frame1, uint_frame2, uint_frame3, uint_frame4,
+                                         float_frame1, float_frame2, str_frame], axis=1)  # concatenate horizontally
+
+        if start_time:
+            date_range = pd.date_range(start=start_time, periods=frame.shape[0], freq=freq, name='index')
+            frame.index = date_range
+
+        return frame
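For orientation, a minimal usage sketch of the helpers added above (not part of the commit; the sizes, column prefixes and assertion are illustrative):

```python
import numpy as np
import pandas as pd
from arcticdb.util.utils import DFGenerator

# 100 rows x 60 columns: the ten numeric dtype groups plus 5 string columns,
# indexed by a timestamp range starting at the epoch.
df = DFGenerator.generate_wide_dataframe(num_rows=100, num_cols=60, num_string_cols=5,
                                         start_time=pd.Timestamp(0))
assert df.shape == (100, 60)

# Single-dtype frames can also be generated directly when only one type is needed.
ints = DFGenerator.generate_random_int_dataframe("int32_col", num_rows=10, num_cols=3, dtype=np.int32)
floats = DFGenerator.generate_random_float_dataframe("float_col", num_rows=10, num_cols=3, round_at=2)
```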

python/tests/conftest.py (+1 -1)

@@ -129,7 +129,7 @@ def lmdb_storage(tmp_path) -> Generator[LmdbStorageFixture, None, None]:


 @pytest.fixture
-def lmdb_library(lmdb_storage, lib_name) -> Library:
+def lmdb_library(lmdb_storage, lib_name) -> Generator[Library, None, None]:
     return lmdb_storage.create_arctic().create_library(lib_name)

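For context, `Generator[Library, None, None]` is the typing convention for fixtures that yield their value; a minimal, hypothetical illustration of that pattern (not code from the repository):

```python
from typing import Generator

import pytest
from arcticdb.version_store.library import Library

@pytest.fixture
def example_library(lmdb_storage, lib_name) -> Generator[Library, None, None]:
    # Yield the library to the test; anything after the yield would run as teardown.
    yield lmdb_storage.create_arctic().create_library(lib_name)
```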

python/tests/stress/arcticdb/version_store/test_mem_leaks.py (+161 -2)
@@ -22,17 +22,28 @@
 import time
 import pandas as pd

-from typing import Generator, Tuple
+from typing import Generator, List, Tuple
+from arcticdb.encoding_version import EncodingVersion
+from arcticdb.options import LibraryOptions
 from arcticdb.util.test import get_sample_dataframe, random_string
+from arcticdb.util.utils import DFGenerator
 from arcticdb.version_store.library import Library, ReadRequest
 from arcticdb.version_store.processing import QueryBuilder
 from arcticdb.version_store._store import NativeVersionStore
-from tests.util.mark import MACOS, SLOW_TESTS_MARK, WINDOWS, MEMRAY_SUPPORTED, MEMRAY_TESTS_MARK, SKIP_CONDA_MARK
+from arcticdb_ext.version_store import PythonVersionStoreReadOptions
+from tests.util.mark import LINUX, MACOS, SLOW_TESTS_MARK, WINDOWS, MEMRAY_SUPPORTED, MEMRAY_TESTS_MARK, SKIP_CONDA_MARK


 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("Memory_tests")

+## IMPORTANT !!!
+##
+## All memory tests MUST be done with fixtures that return a Library object
+## and not a NativeVersionStore. The reason is that the latter is a thick wrapper
+## which can hide some possible problems, therefore all tests have to be done
+## against what the customer actually works with.
+

 # region HELPER functions for non-memray tests
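As a hedged illustration of the note above (the fixture names exist in the repository; the test body itself is purely illustrative): a memory test should take a fixture that provides a `Library`, such as `lmdb_library`, so leaks are measured against the same public API a customer uses.

```python
import pandas as pd
from arcticdb.version_store.library import Library

def test_mem_example(lmdb_library):
    lib: Library = lmdb_library  # the public Library API, not NativeVersionStore internals
    lib.write("example_sym", pd.DataFrame({"a": [1, 2, 3]}))
    assert lib.read("example_sym").data.shape == (3, 1)
```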

@@ -590,6 +601,30 @@ def is_relevant(stack: Stack) -> bool:
         # do something to check if we need this to be added
         # as mem leak
         # print(f"SAMPLE >>> {frame.filename}:{frame.function}[{frame.lineno}]")
+        frame_info_str = f"{frame.filename}:{frame.function}:[{frame.lineno}]"
+
+        if "folly::CPUThreadPoolExecutor::CPUTask" in frame_info_str:
+            logger.warning(f"Frame excluded : {frame_info_str}")
+            logger.warning(f'''Explanation : These are on purpose, and they come from the interaction of
+                multi-threading and forking. When Python forks, the task scheduler has a linked list
+                of tasks to execute, but there is a global lock held that protects the thread-local state.
+                We can't free the list without accessing the global thread-local storage singleton,
+                and that is protected by a lock which is now held by a thread that is in a different
+                process, so it will never be unlocked in the child. As a workaround we intentionally
+                leak the task scheduler and replace it with a new one, in this method:
+                https://github.com/man-group/ArcticDB/blob/master/cpp/arcticdb/async/task_scheduler.cpp#L34
+
+                It's actually due to a bug in folly, because the lock around the thread-local
+                storage manager has a compile-time token that should be used to differentiate it
+                from other locks, but it has been constructed with type void, as have other locks.
+                It's possible they might fix it at some point, in which case we can free the memory.
+                Or we do have a vague intention to move away from folly for async if we
+                find something better.
+
+                Great that it is catching this, as it's the one case in the whole project where I know
+                for certain that it does leak memory (and only because there's no alternative).''')
+            return False
+
         pass
     return True

@@ -727,3 +762,127 @@ def test_mem_leak_read_all_arctic_lib_memray(library_with_big_symbol_):
     logger.info(f"Test took : {time.time() - st}")

     gc.collect()
+
+@pytest.fixture
+def lmdb_library(lmdb_storage, lib_name, request) -> Generator[Library, None, None]:
+    """
+    Allows passing library creation parameters as parameters of the test or other fixture.
+    Example:
+
+        @pytest.mark.parametrize("lmdb_library_any", [
+            {'library_options': LibraryOptions(rows_per_segment=100, columns_per_segment=100)}
+        ], indirect=True)
+        def test_my_test(lmdb_library_any):
+            .....
+    """
+    params = request.param if hasattr(request, 'param') else {}
+    yield lmdb_storage.create_arctic().create_library(name=lib_name, **params)
+
+
+@pytest.fixture
+def prepare_head_tails_symbol(lmdb_library):
+    """
+    This fixture is part of the test `test_mem_leak_head_tail_memray`.
+
+    It creates a symbol with several versions and a snapshot for each version.
+    It inserts dataframes which are large relative to the segment size of the library,
+    and if dynamic schema is used each version has more columns than the previous one.
+
+    Should not be reused.
+    """
+    lib: Library = lmdb_library
+    opts = lib.options()
+
+    total_number_columns = 1002
+    symbol = "asdf12345"
+    num_rows_list = [279, 199, 1, 350, 999, 0, 1001]
+    snapshot_names = []
+    for rows in num_rows_list:
+        st = time.time()
+        df = DFGenerator.generate_wide_dataframe(num_rows=rows, num_cols=total_number_columns, num_string_cols=25,
+                                                 start_time=pd.Timestamp(0), seed=64578)
+        lib.write(symbol, df)
+        snap = f"{symbol}_{rows}"
+        lib.snapshot(snap)
+        snapshot_names.append(snap)
+        logger.info(f"Generated {rows} rows in {time.time() - st} sec")
+        if opts.dynamic_schema:
+            # Dynamic-schema libraries are dynamic by nature, so the test should cover that
+            # characteristic
+            total_number_columns += 20
+            logger.info(f"Total number of columns increased to {total_number_columns}")
+
+    all_columns = df.columns.to_list()
+    yield (lib, symbol, num_rows_list, snapshot_names, all_columns)
+    lib.delete(symbol=symbol)
+
+
+@MEMRAY_TESTS_MARK
+@SLOW_TESTS_MARK
+## Linux is showing quite a large leak per location; there will be a separate issue to investigate why
+@pytest.mark.limit_leaks(location_limit="1000 KB" if LINUX else "52 KB", filter_fn=is_relevant)
+@pytest.mark.parametrize("lmdb_library", [
+    {'library_options': LibraryOptions(rows_per_segment=233, columns_per_segment=197, dynamic_schema=True, encoding_version=EncodingVersion.V2)},
+    {'library_options': LibraryOptions(rows_per_segment=99, columns_per_segment=99, dynamic_schema=False, encoding_version=EncodingVersion.V1)}
+], indirect=True)
+def test_mem_leak_head_tail_memray(prepare_head_tails_symbol):
+    """
+    This test checks whether the `head` and `tail` functions leak memory.
+    The creation of the initial symbol (the test environment precondition) is in a specialized fixture
+    so that memray does not track memory there, as that would slow the process many times over.
+    """
+
+    symbol: str
+    num_rows_list: List[int]
+    store: NativeVersionStore = None
+    snapshot_names: List[str]
+    all_columns: List[str]
+    (store, symbol, num_rows_list, snapshot_names, all_columns) = prepare_head_tails_symbol
+
+    start_test: float = time.time()
+    max_rows: int = max(num_rows_list)
+
+    np.random.seed(959034)
+    # construct a list of head and tail row counts to be selected
+    num_rows_to_select = []
+    important_values = [0, 1, -1, 2, -2, max_rows, -max_rows]  # some boundary cases
+    num_rows_to_select.extend(important_values)
+    num_rows_to_select.extend(np.random.randint(low=5, high=99, size=7))  # add 7 more random values
+    # the number of iterations will be the list length/size
+    iterations = len(num_rows_to_select)
+    # construct a random list of snapshot names, one for each iteration
+    snapshots_list: List[str] = np.random.choice(snapshot_names, iterations)
+    # construct a random list of version numbers, one for each iteration
+    versions_list: List[int] = np.random.randint(0, len(num_rows_list) - 1, iterations)
+    # construct a random list of numbers of columns to be selected
+    number_columns_for_selection_list: List[int] = np.random.randint(0, len(all_columns) - 1, iterations)
+
+    count: int = 0
+    # We will execute all head/tail operations several times with a specific number of columns.
+    # The number of columns consists of random columns and the boundary cases defined above.
+    for rows in num_rows_to_select:
+        selected_columns: List[str] = np.random.choice(all_columns, number_columns_for_selection_list[count], replace=False).tolist()
+        snap: str = snapshots_list[count]
+        ver: int = int(versions_list[count])
+        logger.info(f"rows {rows} / snapshot {snap}")
+        df1: pd.DataFrame = store.head(n=rows, as_of=snap, symbol=symbol).data
+        df2: pd.DataFrame = store.tail(n=rows, as_of=snap, symbol=symbol).data
+        df3: pd.DataFrame = store.head(n=rows, as_of=ver, symbol=symbol, columns=selected_columns).data
+        difference = list(set(df3.columns.to_list()).difference(set(selected_columns)))
+        assert len(difference) == 0, f"Columns not included : {difference}"
+        df4: pd.DataFrame = store.tail(n=rows, as_of=ver, symbol=symbol, columns=selected_columns).data
+        difference = list(set(df4.columns.to_list()).difference(set(selected_columns)))
+        assert len(difference) == 0, f"Columns not included : {difference}"
+
+        logger.info(f"Iteration {count} / {iterations} completed")
+        count += 1
+        del selected_columns, df1, df2, df3, df4
+
+    del store, symbol, num_rows_list, snapshot_names, all_columns
+    del num_rows_to_select, important_values, snapshots_list, versions_list, number_columns_for_selection_list
+    gc.collect()
+    time.sleep(10)  # collection is not immediate
+    logger.info(f"Test completed in {time.time() - start_test}")

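For completeness, a minimal sketch of how the leak filter and the `limit_leaks` marker used above fit together, assuming pytest-memray is installed and the tests are run with its `--memray` option; the symbol name and the 10 KB budget are illustrative, and the `Stack` import path is an assumption rather than something taken from this commit:

```python
import pandas as pd
import pytest
from pytest_memray import Stack  # assumed location of the Stack type used by filter callbacks

def ignore_known_folly_frames(stack: Stack) -> bool:
    # Return False to exclude allocations whose stack contains the known, intentional folly leak.
    for frame in stack.frames:
        if "folly::CPUThreadPoolExecutor::CPUTask" in f"{frame.filename}:{frame.function}":
            return False
    return True

@pytest.mark.limit_leaks(location_limit="10 KB", filter_fn=ignore_known_folly_frames)
def test_write_read_does_not_leak(lmdb_library):
    lmdb_library.write("example_sym", pd.DataFrame({"a": range(1000)}))
    lmdb_library.read("example_sym")
```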