man-group
diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt
+12 -4
diff --git a/cpp/arcticdb/column_store/block.hpp b/cpp/arcticdb/column_store/block.hpp
+12 -9
diff --git a/cpp/arcticdb/column_store/chunked_buffer.cpp b/cpp/arcticdb/column_store/chunked_buffer.cpp
+2 -2
diff --git a/cpp/arcticdb/column_store/chunked_buffer.hpp b/cpp/arcticdb/column_store/chunked_buffer.hpp
-1
diff --git a/cpp/arcticdb/util/mean.hpp b/cpp/arcticdb/util/mean.hpp
+35 -54
diff --git a/cpp/arcticdb/util/min_max_float.hpp b/cpp/arcticdb/util/min_max_float.hpp
+29 -35
@@ -381,6 +381,11 @@ set(arcticdb_srcs
         util/lazy.hpp
         util/type_traits.hpp
         util/variant.hpp
+        util/min_max_integer.hpp
+        util/mean.hpp
+        util/min_max_float.hpp
+        util/sum.hpp
+        util/vector_common.hpp
         version/de_dup_map.hpp
         version/op_log.hpp
         version/schema_checks.hpp
@@ -523,7 +528,7 @@ set(arcticdb_srcs
         version/version_core.cpp
         version/version_store_api.cpp
         version/version_utils.cpp
-        version/version_map_batch_methods.cpp util/min_max_integer.hpp util/mean.hpp util/min_max_float.hpp util/sum.hpp)
+        version/version_map_batch_methods.cpp )
 
 add_library(arcticdb_core_object OBJECT ${arcticdb_srcs})
 
@@ -750,8 +755,8 @@ if (SSL_LINK)
     find_package(OpenSSL REQUIRED)
     list(APPEND arcticdb_core_libraries OpenSSL::SSL)
     if (NOT WIN32)
-        #list(APPEND arcticdb_core_libraries ${KERBEROS_LIBRARY})
-        #list(APPEND arcticdb_core_includes  ${KERBEROS_INCLUDE_DIR})
+        list(APPEND arcticdb_core_libraries ${KERBEROS_LIBRARY})
+        list(APPEND arcticdb_core_includes  ${KERBEROS_INCLUDE_DIR})
     endif()
 endif ()
 target_link_libraries(arcticdb_core_object PUBLIC ${arcticdb_core_libraries})
@@ -968,6 +973,9 @@ if(${TEST})
             util/test/test_storage_lock.cpp
             util/test/test_string_pool.cpp
             util/test/test_string_utils.cpp
+            util/test/test_min_max_float.cpp
+            util/test/test_sum.cpp
+            util/test/test_mean.cpp
             util/test/test_tracing_allocator.cpp
             version/test/test_append.cpp
             version/test/test_key_block.cpp
@@ -980,7 +988,7 @@ if(${TEST})
             version/test/test_version_map_batch.cpp
             version/test/test_version_store.cpp
             version/test/version_map_model.hpp
-            python/python_handlers.cpp util/test/test_min_max_float.cpp util/test/test_sum.cpp util/test/test_mean.cpp)
+            python/python_handlers.cpp)
 
     set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755
 
 
@@ -15,7 +15,6 @@
 namespace arcticdb {
 
 struct MemBlock {
-    static const size_t Align = 128;
     static const size_t MinSize = 64;
     using magic_t = arcticdb::util::MagicNum<'M', 'e', 'm', 'b'>;
     magic_t magic_;
@@ -136,17 +135,21 @@ struct MemBlock {
     bool owns_external_data_ = false;
 
     static const size_t HeaderDataSize =
-            sizeof(magic_) +   // 8 bytes
-            sizeof(bytes_) +   // 8 bytes
-            sizeof(capacity_) +   // 8 bytes
+        sizeof(magic_) +
+            sizeof(bytes_) +
+            sizeof(capacity_) +
             sizeof(external_data_) +
             sizeof(offset_) +
-            sizeof(timestamp_) + 
+            sizeof(timestamp_) +
             sizeof(owns_external_data_);
 
-    uint8_t pad[Align - HeaderDataSize];
-    static const size_t HeaderSize = HeaderDataSize + sizeof(pad);
-    static_assert(HeaderSize == Align);
-    uint8_t data_[MinSize];
+    static const size_t DataAlignment = 64;
+    static const size_t PadSize = (DataAlignment - (HeaderDataSize % DataAlignment)) % DataAlignment;
+
+    uint8_t pad[PadSize];
+    static const size_t HeaderSize = HeaderDataSize + PadSize;
+    static_assert(HeaderSize % DataAlignment == 0, "Header size must be aligned to 64 bytes");
+
+    alignas(DataAlignment) uint8_t data_[MinSize];
 };
 }
@@ -68,7 +68,7 @@ std::vector<ChunkedBufferImpl<BlockSize>> split(const ChunkedBufferImpl<BlockSiz
 }
 
 template std::vector<ChunkedBufferImpl<64>> split(const ChunkedBufferImpl<64>& input, size_t nbytes);
-template std::vector<ChunkedBufferImpl<3968>> split(const ChunkedBufferImpl<3968>& input, size_t nbytes);
+template std::vector<ChunkedBufferImpl<4032ul>> split(const ChunkedBufferImpl<4032ul>& input, size_t nbytes);
 
 // Inclusive of start_byte, exclusive of end_byte
 template <size_t BlockSize>
@@ -112,6 +112,6 @@ ChunkedBufferImpl<BlockSize> truncate(const ChunkedBufferImpl<BlockSize>& input,
 }
 
 template ChunkedBufferImpl<64> truncate(const ChunkedBufferImpl<64>& input, size_t start_byte, size_t end_byte);
-template ChunkedBufferImpl<3968> truncate(const ChunkedBufferImpl<3968>& input, size_t start_byte, size_t end_byte);
+template ChunkedBufferImpl<4032ul> truncate(const ChunkedBufferImpl<4032ul>& input, size_t start_byte, size_t end_byte);
 
 } //namespace arcticdb
@@ -39,7 +39,6 @@ class ChunkedBufferImpl {
 
     using BlockType = MemBlock;
 
-    static_assert(sizeof(BlockType) == BlockType::Align + BlockType::MinSize);
     static_assert(DefaultBlockSize >= BlockType::MinSize);
 
   public:
 
@@ -3,98 +3,79 @@
 #include <type_traits>
 #include <cstddef>
 
+#include <arcticdb/util/vector_common.hpp>
 
-template<typename T>
-struct is_supported_type : std::false_type {};
-
-template<> struct is_supported_type<int8_t> : std::true_type {};
-template<> struct is_supported_type<uint8_t> : std::true_type {};
-template<> struct is_supported_type<int16_t> : std::true_type {};
-template<> struct is_supported_type<uint16_t> : std::true_type {};
-template<> struct is_supported_type<int32_t> : std::true_type {};
-template<> struct is_supported_type<uint32_t> : std::true_type {};
-template<> struct is_supported_type<int64_t> : std::true_type {};
-template<> struct is_supported_type<uint64_t> : std::true_type {};
-template<> struct is_supported_type<float> : std::true_type {};
-template<> struct is_supported_type<double> : std::true_type {};
-
-template<typename T>
-struct MeanResult {
-    T mean;
-    size_t count;  // Useful for floating point to know how many non-NaN values
-};
+namespace arcticdb {
 
 template<typename T>
 class MeanFinder {
-    static_assert(is_supported_type<T>::value, "Unsupported type");
-
-    using VectorType = T __attribute__((vector_size(64)));
+    static_assert(is_supported_int<T>::value || is_supported_float<T>::value, "Unsupported type");  // MeanFinder<T>::find averages n values of T (NaNs skipped for floating T); T must satisfy one of these traits
 
 public:
-
     static double find(const T* data, size_t n) {
+        using VectorType = vector_type<T>;  // vector of T lanes; vector_type is presumably provided by vector_common.hpp (width not visible here — confirm)
+        using AccumVectorType = vector_type<double>;  // sums are accumulated in double regardless of T
 
-        using AccumVectorType = double __attribute__((vector_size(64)));
-
+        AccumVectorType vsum = {0.0};  // per-lane running sums, starting at zero
         const size_t elements_per_vector = sizeof(VectorType) / sizeof(T);
-        const size_t vlen = n / elements_per_vector;
+        const size_t doubles_per_vector = sizeof(AccumVectorType) / sizeof(double);  // number of double lanes in the accumulator
+        const size_t vectors_per_acc = elements_per_vector / doubles_per_vector;  // how many accumulator-sized groups of elements one VectorType holds
 
-        AccumVectorType sum_vec = {0};
-        AccumVectorType count_vec = {0};
-        double total_sum = 0;
         size_t valid_count = 0;
 
-        for(size_t i = 0; i < vlen; i++) {
-            VectorType v = reinterpret_cast<const VectorType*>(data)[i];
+        const auto* vdata = reinterpret_cast<const VectorType*>(data);  // NOTE(review): assumes data is suitably aligned for VectorType loads — confirm at call sites
+        const size_t vector_len = n / elements_per_vector;  // number of complete vectors; leftover elements go through the scalar loop below
+
+        for(size_t i = 0; i < vector_len; i++) {
+            VectorType v = vdata[i];
 
             if constexpr(std::is_floating_point_v<T>) {
-                VectorType mask = v == v;  // !NaN
-                VectorType valid = v & mask;
-                VectorType replaced = VectorType{0} & ~mask;
-                v = valid | replaced;
+                VectorType mask = v == v;  // per-lane self-comparison: a NaN lane compares unequal to itself
+                v = v & mask;  // clear the NaN lanes so they contribute nothing to the sum
 
-                AccumVectorType count_mask;
+                const T* mask_arr = reinterpret_cast<const T*>(&mask);
                 for(size_t j = 0; j < elements_per_vector; j++) {
-                    count_mask[j] = reinterpret_cast<const T*>(&mask)[j] != 0 ? 1.0 : 0.0;
+                    if(mask_arr[j] != 0) valid_count++;  // count only the lanes that passed the NaN check
                 }
-                count_vec += count_mask;
             } else {
-                count_vec += AccumVectorType{1};
+                valid_count += elements_per_vector;  // non-floating T: every lane is counted
             }
 
-            AccumVectorType v_double;
-            for(size_t j = 0; j < elements_per_vector; j++) {
-                v_double[j] = static_cast<double>(reinterpret_cast<const T*>(&v)[j]);
+            const T* v_arr = reinterpret_cast<const T*>(&v);  // scalar view of the (possibly NaN-cleared) vector
+            for(size_t chunk = 0; chunk < vectors_per_acc; chunk++) {
+                for(size_t j = 0; j < doubles_per_vector; j++) {
+                    size_t idx = chunk * doubles_per_vector + j;
+                    reinterpret_cast<double*>(&vsum)[j] += static_cast<double>(v_arr[idx]);  // add element idx, widened to double, into accumulator lane j
+                }
             }
-            sum_vec += v_double;
         }
 
-        const double* sum_arr = reinterpret_cast<const double*>(&sum_vec);
-        const double* count_arr = reinterpret_cast<const double*>(&count_vec);
-        for(size_t i = 0; i < elements_per_vector; i++) {
-            total_sum += sum_arr[i];
-            valid_count += static_cast<size_t>(count_arr[i]);
+        double total = 0.0;  // scalar total: accumulator lanes first, then the leftover elements
+        const auto* sum_arr = reinterpret_cast<const double*>(&vsum);
+        for(size_t i = 0; i < doubles_per_vector; i++) {
+            total += sum_arr[i];
         }
 
-        const T* remain = data + (vlen * elements_per_vector);
+        const T* remain = data + (vector_len * elements_per_vector);  // first element not covered by a complete vector
         for(size_t i = 0; i < n % elements_per_vector; i++) {
             if constexpr(std::is_floating_point_v<T>) {
                 if (remain[i] == remain[i]) {  // Not NaN
-                    total_sum += static_cast<double>(remain[i]);
+                    total += static_cast<double>(remain[i]);
                     valid_count++;
                 }
             } else {
-                total_sum += static_cast<double>(remain[i]);
+                total += static_cast<double>(remain[i]);
                 valid_count++;
             }
         }
 
-        double mean = valid_count > 0 ? total_sum / valid_count : 0.0;
-        return mean;
+        return valid_count > 0 ? total / static_cast<double>(valid_count) : 0.0;  // mean of the counted values; 0.0 when nothing was counted
     }
 };
 
 template<typename T>
-double find_mean(const T* data, size_t n) {
+double find_mean(const T *data, size_t n) {  // free-function convenience wrapper: forwards data and n to MeanFinder<T>::find
     return MeanFinder<T>::find(data, n);
 }
+
+} // namespace arcticdb
@@ -4,54 +4,51 @@
 #include <cstddef>
 #include <algorithm>
 
-namespace arcticdb {
+#include <arcticdb/util/vector_common.hpp>
 
-template<typename T>
-struct is_supported_float : std::false_type {};
+namespace arcticdb {
 
 template<typename T>
 using vector_type __attribute__((vector_size(64))) = T;
 
-template<> struct is_supported_float<float> : std::true_type {};
-template<> struct is_supported_float<double> : std::true_type {};
-
 template<typename T>
 class FloatMinFinder {
     static_assert(is_supported_float<T>::value, "Type must be float or double");
     static_assert(std::is_floating_point_v<T>, "Type must be floating point");
 
 public:
-    static T find(const T *data, size_t n) {
-        using vec_t __attribute__((vector_size(64))) = T;
+    static T find(const T* data, size_t n) {
+        using vec_t = vector_type<T>;
 
+        // Initialize min vector with infinity
         vec_t vmin;
-        for (size_t i = 0; i < sizeof(vec_t) / sizeof(T); i++) {
-            reinterpret_cast<T *>(&vmin)[i] = std::numeric_limits<T>::infinity();
+        for(size_t i = 0; i < sizeof(vec_t)/sizeof(T); i++) {
+            reinterpret_cast<T*>(&vmin)[i] = std::numeric_limits<T>::infinity();
         }
 
-        const vec_t *vdata = reinterpret_cast<const vec_t *>(data);
+        // Process full vectors
+        const vec_t* vdata = reinterpret_cast<const vec_t*>(data);
         const size_t elements_per_vector = sizeof(vec_t) / sizeof(T);
         const size_t vlen = n / elements_per_vector;
 
-        for (size_t i = 0; i < vlen; i++) {
+        // Main SIMD loop
+        for(size_t i = 0; i < vlen; i++) {
             vec_t v = vdata[i];
-            vec_t mask = v == v;  // !NaN
-            vec_t valid = v & mask;
-            vec_t replaced = vmin & ~mask;
-            v = valid | replaced;
             vmin = (v < vmin) ? v : vmin;
         }
 
+        // Reduce vector to scalar
         T min_val = std::numeric_limits<T>::infinity();
-        const T *min_arr = reinterpret_cast<const T *>(&vmin);
-        for (size_t i = 0; i < elements_per_vector; i++) {
+        const T* min_arr = reinterpret_cast<const T*>(&vmin);
+        for(size_t i = 0; i < elements_per_vector; i++) {
             if (min_arr[i] == min_arr[i]) {  // Not NaN
                 min_val = std::min(min_val, min_arr[i]);
             }
         }
 
-        const T *remain = data + (vlen * elements_per_vector);
-        for (size_t i = 0; i < n % elements_per_vector; i++) {
+        // Handle remainder
+        const T* remain = data + (vlen * elements_per_vector);
+        for(size_t i = 0; i < n % elements_per_vector; i++) {
             if (remain[i] == remain[i]) {  // Not NaN
                 min_val = std::min(min_val, remain[i]);
             }
@@ -67,41 +64,38 @@ class FloatMaxFinder {
     static_assert(std::is_floating_point_v<T>, "Type must be floating point");
 
 public:
-    static T find(const T *data, size_t n) {
+    static T find(const T* data, size_t n) {
         using vec_t = vector_type<T>;
 
         // Initialize max vector with negative infinity
         vec_t vmax;
-        for (size_t i = 0; i < sizeof(vec_t) / sizeof(T); i++) {
-            reinterpret_cast<T *>(&vmax)[i] = -std::numeric_limits<T>::infinity();
+        for(size_t i = 0; i < sizeof(vec_t)/sizeof(T); i++) {
+            reinterpret_cast<T*>(&vmax)[i] = -std::numeric_limits<T>::infinity();
         }
 
-        const vec_t *vdata = reinterpret_cast<const vec_t *>(data);
+        // Process full vectors
+        const vec_t* vdata = reinterpret_cast<const vec_t*>(data);
         const size_t elements_per_vector = sizeof(vec_t) / sizeof(T);
         const size_t vlen = n / elements_per_vector;
 
         // Main SIMD loop
-        for (size_t i = 0; i < vlen; i++) {
+        for(size_t i = 0; i < vlen; i++) {
             vec_t v = vdata[i];
-            // Create mask for non-NaN values
-            vec_t mask = v == v;  // false for NaN
-            vec_t valid = v & mask;
-            vec_t replaced = vmax & ~mask;
-            v = valid | replaced;
-            // Vector max
             vmax = (v > vmax) ? v : vmax;
         }
 
+        // Reduce vector to scalar
         T max_val = -std::numeric_limits<T>::infinity();
-        const T *max_arr = reinterpret_cast<const T *>(&vmax);
-        for (size_t i = 0; i < elements_per_vector; i++) {
+        const T* max_arr = reinterpret_cast<const T*>(&vmax);
+        for(size_t i = 0; i < elements_per_vector; i++) {
             if (max_arr[i] == max_arr[i]) {  // Not NaN
                 max_val = std::max(max_val, max_arr[i]);
             }
         }
 
-        const T *remain = data + (vlen * elements_per_vector);
-        for (size_t i = 0; i < n % elements_per_vector; i++) {
+        // Handle remainder
+        const T* remain = data + (vlen * elements_per_vector);
+        for(size_t i = 0; i < n % elements_per_vector; i++) {
             if (remain[i] == remain[i]) {  // Not NaN
                 max_val = std::max(max_val, remain[i]);
             }
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@ std::vector<ChunkedBufferImpl<BlockSize>> split(const ChunkedBufferImpl<BlockSiz`
`68`	`68`	`}`
`69`	`69`
`70`	`70`	`template std::vector<ChunkedBufferImpl<64>> split(const ChunkedBufferImpl<64>& input, size_t nbytes);`
`71`		`-template std::vector<ChunkedBufferImpl<3968>> split(const ChunkedBufferImpl<3968>& input, size_t nbytes);`
	`71`	`+template std::vector<ChunkedBufferImpl<4032ul>> split(const ChunkedBufferImpl<4032ul>& input, size_t nbytes);`
`72`	`72`
`73`	`73`	`// Inclusive of start_byte, exclusive of end_byte`
`74`	`74`	`template <size_t BlockSize>`
`@@ -112,6 +112,6 @@ ChunkedBufferImpl<BlockSize> truncate(const ChunkedBufferImpl<BlockSize>& input,`
`112`	`112`	`}`
`113`	`113`
`114`	`114`	`template ChunkedBufferImpl<64> truncate(const ChunkedBufferImpl<64>& input, size_t start_byte, size_t end_byte);`
`115`		`-template ChunkedBufferImpl<3968> truncate(const ChunkedBufferImpl<3968>& input, size_t start_byte, size_t end_byte);`
	`115`	`+template ChunkedBufferImpl<4032ul> truncate(const ChunkedBufferImpl<4032ul>& input, size_t start_byte, size_t end_byte);`
`116`	`116`
`117`	`117`	`} //namespace arcticdb`