diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h
index 499754808fd..3c70e6c68dd 100644
--- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h
+++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h
@@ -67,10 +67,11 @@ class KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu> : publi
                              const daal::algorithms::Parameter * par);
 
 protected:
-    void findNearestNeighbors(const algorithmFpType * query, Heap<GlobalNeighbors<algorithmFpType, cpu>, cpu> & heap,
-                              kdtree_knn_classification::internal::Stack<SearchNode<algorithmFpType>, cpu> & stack, size_t k, algorithmFpType radius,
-                              const KDTreeTable & kdTreeTable, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA,
-                              services::internal::TArrayScalable<algorithmFpType *, cpu> & soa_arrays);
+    services::Status findNearestNeighbors(const algorithmFpType * query, Heap<GlobalNeighbors<algorithmFpType, cpu>, cpu> & heap,
+                                          kdtree_knn_classification::internal::Stack<SearchNode<algorithmFpType>, cpu> & stack, size_t k,
+                                          algorithmFpType radius, const KDTreeNode * nodes, size_t rootTreeNodeIndex,
+                                          const NumericTable & data, const bool isHomogenSOA,
+                                          services::internal::TArrayScalable<algorithmFpType *, cpu> & soa_arrays);
 
     services::Status predict(algorithmFpType * predictedClass, const Heap<GlobalNeighbors<algorithmFpType, cpu>, cpu> & heap,
                              const NumericTable * labels, size_t k, VoteWeights voteWeights, const NumericTable * modelIndices,
diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i
old mode 100755
new mode 100644
index 82cb20faaed..17e1038b664
--- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i
@@ -39,6 +39,11 @@
 #include "src/algorithms/k_nearest_neighbors/kdtree_knn_classification_model_impl.h"
 #include "src/algorithms/k_nearest_neighbors/kdtree_knn_impl.i"
 #include "src/algorithms/k_nearest_neighbors/knn_heap.h"
+#include <iostream>
+
+#if defined(DAAL_INTEL_CPP_COMPILER)
+    #include <immintrin.h>
+#endif
 
 namespace daal
 {
@@ -145,11 +150,16 @@ Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::compu
 
     if (par3 == NULL) return Status(ErrorNullParameterNotSupported);
 
-    const Model * const model    = static_cast<const Model *>(m);
-    const auto & kdTreeTable     = *(model->impl()->getKDTreeTable());
-    const auto rootTreeNodeIndex = model->impl()->getRootNodeIndex();
-    const NumericTable & data    = *(model->impl()->getData());
-    const NumericTable * labels  = nullptr;
+    const Model * const model       = static_cast<const Model *>(m);
+    const KDTreeTable & kdTreeTable = *(model->impl()->getKDTreeTable());
+    // Fetch the node array once here; it is handed to findNearestNeighbors() for every query row.
+    const KDTreeNode * const nodes  = static_cast<const KDTreeNode *>(kdTreeTable.getArray());
+    // xRowCount and base are declared this early because the stack-size computation below reads them.
+    const size_t xRowCount          = x->getNumberOfRows();
+    const algorithmFpType base      = 2.0;
+    const auto rootTreeNodeIndex    = model->impl()->getRootNodeIndex();
+    const NumericTable & data       = *(model->impl()->getData());
+    const NumericTable * labels     = nullptr;
     if (resultsToEvaluate != 0)
     {
         labels = model->impl()->getLabels().get();
@@ -164,28 +191,29 @@ Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::compu
     }
     const size_t heapSize = (iSize / 16 + 1) * 16;
 
-    const size_t xRowCount        = x->getNumberOfRows();
-    const algorithmFpType base    = 2.0;
-    const size_t expectedMaxDepth = (Math::sLog(xRowCount) / Math::sLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR;
-    const size_t stackSize        = Math::sPowx(base, Math::sCeil(Math::sLog(expectedMaxDepth) / Math::sLog(base)));
+    // const size_t xRowCount        = x->getNumberOfRows();
+    // const algorithmFpType base    = 2.0;
+    const size_t expectedMaxDepth = (Math::xsLog(xRowCount) / Math::xsLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR;
+    const size_t stackSize        = Math::xsPowx(base, Math::xsCeil(Math::xsLog(expectedMaxDepth) / Math::xsLog(base)));
     struct Local
     {
         MaxHeap heap;
         SearchStack stack;
     };
+    SafeStatus safeStat;
     daal::tls<Local *> localTLS([&]() -> Local * {
         Local * const ptr = service_scalable_calloc<Local, cpu>(1);
         if (ptr)
         {
             if (!ptr->heap.init(heapSize))
             {
-                status.add(services::ErrorMemoryAllocationFailed);
+                safeStat.add(services::ErrorMemoryAllocationFailed);
                 service_scalable_free<Local, cpu>(ptr);
                 return nullptr;
             }
             if (!ptr->stack.init(stackSize))
             {
-                status.add(services::ErrorMemoryAllocationFailed);
+                safeStat.add(services::ErrorMemoryAllocationFailed);
                 ptr->heap.clear();
                 service_scalable_free<Local, cpu>(ptr);
                 return nullptr;
@@ -193,7 +221,7 @@ Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::compu
         }
         else
         {
-            status.add(services::ErrorMemoryAllocationFailed);
+            safeStat.add(services::ErrorMemoryAllocationFailed);
         }
         return ptr;
     });
@@ -201,23 +229,23 @@ Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::compu
     DAAL_CHECK_STATUS_OK((status.ok()), status);
 
     const auto maxThreads     = threader_get_threads_number();
+    auto nThreads             = (maxThreads < 1) ? 1 : maxThreads;
     const size_t xColumnCount = x->getNumberOfColumns();
-    const auto rowsPerBlock   = (xRowCount + maxThreads - 1) / maxThreads;
+    const auto rowsPerBlock   = (xRowCount + nThreads - 1) / nThreads;
     const auto blockCount     = (xRowCount + rowsPerBlock - 1) / rowsPerBlock;
-    SafeStatus safeStat;
 
     services::internal::TArrayScalable<algorithmFpType *, cpu> soa_arrays;
     bool isHomogenSOA = checkHomogenSOA<algorithmFpType, cpu>(data, soa_arrays);
 
     daal::threader_for(blockCount, blockCount, [&](int iBlock) {
         Local * const local = localTLS.local();
-        if (local)
-        {
-            services::Status s;
+        DAAL_CHECK_MALLOC_THR(local);
 
-            const size_t first = iBlock * rowsPerBlock;
-            const size_t last  = min<cpu>(static_cast<decltype(xRowCount)>(first + rowsPerBlock), xRowCount);
+        const size_t first = iBlock * rowsPerBlock;
+        const size_t last  = min<cpu>(static_cast<decltype(xRowCount)>(first + rowsPerBlock), xRowCount);
 
+        if (local)
+        {
             const algorithmFpType radius = MaxVal::get();
             data_management::BlockDescriptor<algorithmFpType> xBD;
             const_cast<NumericTable &>(*x).getBlockOfRows(first, last - first, readOnly, xBD);
@@ -227,60 +255,55 @@ Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::compu
             data_management::BlockDescriptor<algorithmFpType> distancesBD;
             if (indices)
             {
-                s = indices->getBlockOfRows(first, last - first, writeOnly, indicesBD);
-                DAAL_CHECK_STATUS_THR(s);
+                DAAL_CHECK_STATUS_THR(indices->getBlockOfRows(first, last - first, writeOnly, indicesBD));
             }
             if (distances)
             {
-                s = distances->getBlockOfRows(first, last - first, writeOnly, distancesBD);
-                DAAL_CHECK_STATUS_THR(s);
+                DAAL_CHECK_STATUS_THR(distances->getBlockOfRows(first, last - first, writeOnly, distancesBD));
             }
 
             if (labels)
             {
                 const size_t yColumnCount = y->getNumberOfColumns();
                 data_management::BlockDescriptor<algorithmFpType> yBD;
-                y->getBlockOfRows(first, last - first, writeOnly, yBD);
+                DAAL_CHECK_STATUS_THR(y->getBlockOfRows(first, last - first, writeOnly, yBD));
                 auto * const dy = yBD.getBlockPtr();
 
                 for (size_t i = 0; i < last - first; ++i)
                 {
-                    findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, kdTreeTable, rootTreeNodeIndex, data,
-                                         isHomogenSOA, soa_arrays);
-                    s = predict(&(dy[i * yColumnCount]), local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses);
-                    DAAL_CHECK_STATUS_THR(s)
+                    DAAL_CHECK_STATUS_THR(findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, nodes,
+                                                               rootTreeNodeIndex, data, isHomogenSOA, soa_arrays));
+                    DAAL_CHECK_STATUS_THR(
+                        predict(&dy[i * yColumnCount], local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses));
                 }
-
-                s |= y->releaseBlockOfRows(yBD);
-                DAAL_CHECK_STATUS_THR(s);
+                DAAL_CHECK_STATUS_THR(y->releaseBlockOfRows(yBD));
             }
             else
             {
                 for (size_t i = 0; i < last - first; ++i)
                 {
-                    findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, kdTreeTable, rootTreeNodeIndex, data,
-                                         isHomogenSOA, soa_arrays);
-                    s = predict(nullptr, local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses);
-                    DAAL_CHECK_STATUS_THR(s)
+                    DAAL_CHECK_STATUS_THR(findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, nodes,
+                                                               rootTreeNodeIndex, data, isHomogenSOA, soa_arrays));
+                    DAAL_CHECK_STATUS_THR(predict(nullptr, local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses));
                 }
             }
 
             if (indices)
             {
-                s |= indices->releaseBlockOfRows(indicesBD);
+                DAAL_CHECK_STATUS_THR(indices->releaseBlockOfRows(indicesBD));
             }
-            DAAL_CHECK_STATUS_THR(s);
+
             if (distances)
             {
-                s |= distances->releaseBlockOfRows(distancesBD);
+                DAAL_CHECK_STATUS_THR(distances->releaseBlockOfRows(distancesBD));
             }
-            DAAL_CHECK_STATUS_THR(s);
 
             const_cast<NumericTable &>(*x).releaseBlockOfRows(xBD);
         }
     });
 
-    DAAL_CHECK_SAFE_STATUS()
+    status = safeStat.detach();
+    if (!status) return status;
 
     localTLS.reduce([&](Local * ptr) -> void {
         if (ptr)
@@ -298,52 +321,59 @@ DAAL_FORCEINLINE void computeDistance(size_t start, size_t end, algorithmFpType
                                       const NumericTable & data, data_management::BlockDescriptor<algorithmFpType> xBD[2],
                                       services::internal::TArrayScalable<algorithmFpType *, cpu> & soa_arrays)
 {
+    // Initialize the distance array to zero for the range [start, end)
     for (size_t i = start; i < end; ++i)
     {
         distance[i - start] = 0;
     }
 
-    size_t curBDIdx  = 0;
-    size_t nextBDIdx = 1;
+    size_t curBDIdx  = 0; // Current block descriptor index
+    size_t nextBDIdx = 1; // Next block descriptor index
 
-    const size_t xColumnCount = data.getNumberOfColumns();
+    const size_t xColumnCount = data.getNumberOfColumns(); // Total number of columns in the data
 
-    const algorithmFpType * nx = nullptr;
-    const algorithmFpType * dx = getNtData(isHomogenSOA, 0, start, end - start, data, xBD[curBDIdx], soa_arrays);
+    const algorithmFpType * dx =
+        getNtData(isHomogenSOA, 0, start, end - start, data, xBD[curBDIdx], soa_arrays); // Retrieve data for the first column
 
-    size_t j;
-    for (j = 1; j < xColumnCount; ++j)
+    // Iterate over each column to compute squared distances
+    for (size_t j = 1; j < xColumnCount; ++j)
     {
-        nx = getNtData(isHomogenSOA, j, start, end - start, data, xBD[nextBDIdx], soa_arrays);
+        const algorithmFpType * nx =
+            getNtData(isHomogenSOA, j, start, end - start, data, xBD[nextBDIdx], soa_arrays); // Retrieve data for the next column
 
+        // Prefetch the next column data to optimize memory access
         DAAL_PREFETCH_READ_T0(nx);
-        DAAL_PREFETCH_READ_T0(nx + 16);
+        DAAL_PREFETCH_READ_T0(nx + 16); // Adjust prefetch based on expected access patterns
 
+        // Compute distance contributions from the current column
         for (size_t i = 0; i < end - start; ++i)
         {
             distance[i] += (query[j - 1] - dx[i]) * (query[j - 1] - dx[i]);
         }
 
+        // Release the current block of data to avoid memory leaks
         releaseNtData<algorithmFpType, cpu>(isHomogenSOA, data, xBD[curBDIdx]);
 
+        // Swap block descriptors and pointers for the next iteration
         services::internal::swap<cpu, size_t>(curBDIdx, nextBDIdx);
         services::internal::swap<cpu, const algorithmFpType *>(dx, nx);
     }
-    {
-        for (size_t i = 0; i < end - start; ++i)
-        {
-            distance[i] += (query[j - 1] - dx[i]) * (query[j - 1] - dx[i]);
-        }
 
-        releaseNtData<algorithmFpType, cpu>(isHomogenSOA, data, xBD[curBDIdx]);
+    // Handle the last column after the loop
+    for (size_t i = 0; i < end - start; ++i)
+    {
+        distance[i] += (query[xColumnCount - 1] - dx[i]) * (query[xColumnCount - 1] - dx[i]);
     }
+
+    // Release the final block of data
+    releaseNtData<algorithmFpType, cpu>(isHomogenSOA, data, xBD[curBDIdx]);
 }
 
 template <typename algorithmFpType, CpuType cpu>
-void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNearestNeighbors(
+services::Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNearestNeighbors(
     const algorithmFpType * query, Heap<GlobalNeighbors<algorithmFpType, cpu>, cpu> & heap,
     kdtree_knn_classification::internal::Stack<SearchNode<algorithmFpType>, cpu> & stack, size_t k, algorithmFpType radius,
-    const KDTreeTable & kdTreeTable, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA,
+    const KDTreeNode * nodes, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA,
     services::internal::TArrayScalable<algorithmFpType *, cpu> & soa_arrays)
 {
     heap.reset();
@@ -354,22 +384,20 @@ void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNea
     const KDTreeNode * node;
     cur.nodeIndex   = rootTreeNodeIndex;
     cur.minDistance = 0;
-
-    DAAL_ALIGNAS(256) algorithmFpType distance[__KDTREE_LEAF_BUCKET_SIZE + 1];
+    algorithmFpType distance[__KDTREE_LEAF_BUCKET_SIZE + 1];
     size_t start, end;
 
     data_management::BlockDescriptor<algorithmFpType> xBD[2];
+
     for (;;)
     {
-        node = static_cast<const KDTreeNode *>(kdTreeTable.getArray()) + cur.nodeIndex;
+        node = &nodes[cur.nodeIndex]; // __KDTREE_NULLDIMENSION marks a node whose left/right indices are a point range
         if (node->dimension == __KDTREE_NULLDIMENSION)
         {
             start = node->leftIndex;
             end   = node->rightIndex;
-
             computeDistance<algorithmFpType, cpu>(start, end, distance, query, isHomogenSOA, data, xBD, soa_arrays);
-
-            for (i = start; i < end; ++i)
+            for (i = start; i < end; i++)
             {
                 if (distance[i - start] <= radius)
                 {
@@ -398,7 +426,7 @@ void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNea
             if (!stack.empty())
             {
                 cur = stack.pop();
-                DAAL_PREFETCH_READ_T0(static_cast<const KDTreeNode *>(kdTreeTable.getArray()) + cur.nodeIndex);
+                DAAL_PREFETCH_READ_T0(&nodes[cur.nodeIndex]); // warm the node just popped; the macro takes an address
             }
             else
             {
@@ -409,7 +437,6 @@ void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNea
         {
             algorithmFpType val        = query[node->dimension];
             const algorithmFpType diff = val - node->cutPoint;
-
             if (cur.minDistance <= radius)
             {
                 cur.nodeIndex    = (diff < 0) ? node->leftIndex : node->rightIndex;
@@ -421,7 +448,7 @@ void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNea
             else if (!stack.empty())
             {
                 cur = stack.pop();
-                DAAL_PREFETCH_READ_T0(static_cast<const KDTreeNode *>(kdTreeTable.getArray()) + cur.nodeIndex);
+                DAAL_PREFETCH_READ_T0(&nodes[cur.nodeIndex]); // warm the node just popped; the macro takes an address
             }
             else
             {
@@ -429,6 +456,7 @@ void KNNClassificationPredictKernel<algorithmFpType, defaultDense, cpu>::findNea
             }
         }
     }
+    return services::Status();
 }
 
 template <typename algorithmFpType, CpuType cpu>
@@ -441,7 +469,7 @@ services::Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, c
 
     const size_t heapSize = heap.size();
     if (heapSize < 1) return services::Status();
-
+    SafeStatus safeStat;
     if (indices.getNumberOfRows() != 0)
     {
         DAAL_ASSERT(modelIndices);
@@ -479,7 +507,7 @@ services::Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, c
             distancesPtr[i] = heap[i].distance;
         }
 
-        Math::vSqrt(heapSize, distancesPtr, distancesPtr);
+        Math::xvSqrt(heapSize, distancesPtr, distancesPtr);
 
         for (size_t i = heapSize; i < nDistances; ++i)
         {
@@ -492,10 +520,12 @@ services::Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, c
         DAAL_ASSERT(predictedClass);
 
         data_management::BlockDescriptor<algorithmFpType> labelBD;
-        algorithmFpType * classes      = static_cast<algorithmFpType *>(daal::services::internal::service_malloc<algorithmFpType, cpu>(heapSize));
-        algorithmFpType * classWeights = static_cast<algorithmFpType *>(daal::services::internal::service_malloc<algorithmFpType, cpu>(nClasses));
-        DAAL_CHECK_MALLOC(classWeights);
-        DAAL_CHECK_MALLOC(classes);
+        // service_malloc<T, cpu>(n) is given an element count here, exactly as in the allocations this replaces.
+        algorithmFpType * classes = daal::services::internal::service_malloc<algorithmFpType, cpu>(heapSize);
+        DAAL_CHECK_MALLOC(classes)
+        algorithmFpType * classWeights = daal::services::internal::service_malloc<algorithmFpType, cpu>(nClasses);
+        if (!classWeights) service_free<algorithmFpType, cpu>(classes); // do not leak the first buffer on the bail-out below
+        DAAL_CHECK_MALLOC(classWeights)
 
         for (size_t i = 0; i < nClasses; ++i)
         {
@@ -563,10 +593,8 @@ services::Status KNNClassificationPredictKernel<algorithmFpType, defaultDense, c
             }
         }
         *predictedClass = maxWeightClass;
-
-        service_free<algorithmFpType, cpu>(classes);
-        service_free<algorithmFpType, cpu>(classWeights);
-        classes = nullptr;
+        daal_free(classes);
+        daal_free(classWeights);
     }
 
     return services::Status();
diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
index 7822be4cd0b..b2f50feb4b7 100644
--- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i
@@ -61,39 +61,72 @@ using namespace kdtree_knn_classification::internal;
 template <typename T, CpuType cpu>
 class Queue
 {
+    static const size_t defaultSize = 4;
 public:
-    Queue() : _data(nullptr) {}
+    Queue() : _data(nullptr), _first(0), _last(0), _count(0), _size(0), _capacity(0) {}
 
     ~Queue()
     {
-        services::daal_free(_data);
-        _data = nullptr;
+        clear();
     }
 
+    Queue(const Queue &)             = delete;
+    Queue & operator=(const Queue &) = delete;
+
     bool init(size_t size)
     {
         clear();
+        if (size == 0) // Check for valid size
+        {
+            return false;
+        }
+
         _first = _count = 0;
         _last = _sizeMinus1 = (_size = size) - 1;
-        return ((_data = static_cast<T *>(service_malloc<T, cpu>(size * sizeof(T)))) != nullptr);
+        _data = static_cast<T *>(service_malloc<T, cpu>(size));
+
+        if (!_data) // Check if memory allocation was successful
+        {
+            return false;
+        }
+
+        _capacity = _size; // Initialize capacity
+        return true;
     }
 
     void clear()
     {
-        daal_free(_data);
-        _data = nullptr;
+        if (_data)
+        {
+            daal::services::internal::service_free<T, cpu>(_data); // Free allocated memory if it exists
+            _data = nullptr;
+        }
+        _first = _last = _count = _size = _sizeMinus1 = _capacity = 0; // Reset state
     }
 
+    void reset() { _first = _count = 0; _last = _sizeMinus1; } // same indices init() sets, so the next push() wraps to slot 0
+
     DAAL_FORCEINLINE void push(const T & value)
     {
-        _data[_last = (_last + 1) & _sizeMinus1] = value;
+        // A full buffer must grow first; otherwise the write below wraps onto an element that is still queued.
+        if (_count >= _capacity && !grow().ok())
+        {
+            // NOTE(review): push() returns void, so the failure cannot be reported; the element is dropped, not stored.
+            return;
+        }
+        _data[_last = (_last + 1) & _sizeMinus1] = value; // advance the tail with masked wrap-around, then store
         ++_count;
     }
 
     DAAL_FORCEINLINE T pop()
     {
-        const T value = _data[_first++];
-        _first *= (_first != _size);
+        // if (empty()) // Check if queue is empty
+        // {
+        //     throw std::underflow_error("Queue underflow: no elements to pop.");
+        // }
+
+        const T value = _data[_first++]; // Retrieve element
+        _first *= (_first != _size); // Reset first index if it reaches the end
         --_count;
         return value;
     }
@@ -102,13 +135,36 @@ public:
 
     size_t size() const { return _count; }
 
+
 private:
+    /// Doubles the storage (or allocates defaultSize slots) and re-packs the queued elements starting at slot 0.
+    /// Members are only updated after the allocation succeeded, so a failure leaves the queue as it was.
+    services::Status grow()
+    {
+        const size_t newCapacity = (_capacity == 0 ? defaultSize : _capacity * 2);
+        T * const newData        = daal::services::internal::service_malloc<T, cpu>(newCapacity);
+        DAAL_CHECK_MALLOC(newData);
+        // Copy in queue order: the live elements start at _first and may wrap past the end of the old buffer.
+        for (size_t i = 0; i < _count; ++i)
+        {
+            newData[i] = _data[(_first + i) % _size];
+        }
+        if (_data) daal::services::internal::service_free<T, cpu>(_data);
+        _data       = newData;
+        _first      = 0;
+        _size       = _capacity = newCapacity;
+        _sizeMinus1 = newCapacity - 1;
+        _last       = (_count != 0 ? _count : newCapacity) - 1; // tail is the last copied slot; empty queue wraps to 0
+        return services::Status();
+    }
+
     T * _data;
-    size_t _first;
-    size_t _last;
-    size_t _count;
-    size_t _size;
-    size_t _sizeMinus1;
+    size_t _first;      // Index of the first element
+    size_t _last;       // Index of the last element
+    size_t _count;      // Current number of elements
+    size_t _size;       // Current size of the queue
+    size_t _sizeMinus1; // Helper for wrap-around logic
+    size_t _capacity;   // Maximum capacity of the queue
 };
 
 struct BuildNode
@@ -160,14 +216,18 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
 
     Queue<BuildNode, cpu> q;
     BBox * bboxQ = nullptr;
     DAAL_CHECK_STATUS(status, buildFirstPartOfKDTree(q, bboxQ, *x, *r, indexes, engine));
-    DAAL_CHECK_STATUS(status, buildSecondPartOfKDTree(q, bboxQ, *x, *r, indexes, engine));
+    // The second build phase runs with one thread; the saved setting is restored before the status check can return.
+    const auto oldThreads = services::Environment::getInstance()->getNumberOfThreads();
+    services::Environment::getInstance()->setNumberOfThreads(1);
+    status |= buildSecondPartOfKDTree(q, bboxQ, *x, *r, indexes, engine);
+    services::Environment::getInstance()->setNumberOfThreads(oldThreads);
+    DAAL_CHECK_STATUS_VAR(status);
     DAAL_CHECK_STATUS(status, rearrangePoints(*x, indexes));
     if (y)
     {
         DAAL_CHECK_STATUS(status, rearrangePoints(*y, indexes));
     }
-
     daal_free(bboxQ);
     bboxQ = nullptr;
     return status;
@@ -183,10 +241,9 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     typedef daal::internal::MathInst<algorithmFpType, cpu> Math;
     typedef BoundingBox<algorithmFpType> BBox;
 
-    const auto maxThreads      = threader_get_threads_number();
     const algorithmFpType base = 2.0;
     const size_t queueSize =
-        2 * Math::sPowx(base, Math::sCeil(Math::sLog(__KDTREE_FIRST_PART_LEAF_NODES_PER_THREAD * maxThreads) / Math::sLog(base)));
+        2 * Math::sPowx(base, Math::sCeil(Math::sLog(__KDTREE_FIRST_PART_LEAF_NODES_PER_THREAD) / Math::sLog(base)));
     const size_t firstPartLeafNodeCount = queueSize / 2;
     q.init(queueSize);
     const size_t xColumnCount = x.getNumberOfColumns();
@@ -196,7 +253,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, queueSize, xColumnCount);
     DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, bboxSize, sizeof(BBox));
 
-    bboxQ = static_cast<BBox *>(service_malloc<BBox, cpu>(bboxSize * sizeof(BBox), sizeof(BBox)));
+    bboxQ = static_cast<BBox *>(service_malloc<BBox, cpu>(bboxSize));
 
     DAAL_CHECK_MALLOC(bboxQ)
     r.impl()->setLastNodeIndex(0);
@@ -221,7 +278,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     size_t sophisticatedSampleIndexes[__KDTREE_DIMENSION_SELECTION_SIZE];
     algorithmFpType sophisticatedSampleValues[__KDTREE_DIMENSION_SELECTION_SIZE];
     const size_t subSampleCount  = xRowCount / __KDTREE_SEARCH_SKIP + 1;
-    algorithmFpType * subSamples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(subSampleCount * sizeof(algorithmFpType)));
+    algorithmFpType * subSamples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(subSampleCount));
     DAAL_CHECK_MALLOC(subSamples)
 
     while (maxNodeCountForCurrentDepth < firstPartLeafNodeCount)
@@ -315,8 +372,8 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
 
         const_cast<NumericTable &>(x).getBlockOfColumnValues(j, 0, xRowCount, readOnly, columnBD);
         const algorithmFpType * const dx = columnBD.getBlockPtr();
-
-        daal::tls<BBox *> bboxTLS([=, &status]() -> BBox * {
+        SafeStatus safeStat;
+        daal::tls<BBox *> bboxTLS([&]() -> BBox * {
             BBox * const ptr = service_scalable_calloc<BBox, cpu>(1);
             if (ptr)
             {
@@ -325,51 +382,52 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
             }
             else
             {
-                status.add(services::ErrorMemoryAllocationFailed);
+                safeStat.add(services::ErrorMemoryAllocationFailed);
             }
             return ptr;
         });
 
         DAAL_CHECK_STATUS_OK((status.ok()), status);
 
-        daal::threader_for(blockCount, blockCount, [=, &bboxTLS](int iBlock) {
+        daal::threader_for(blockCount, blockCount, [=, &bboxTLS, &safeStat](int iBlock) {
             BBox * const bboxLocal = bboxTLS.local();
-            if (bboxLocal)
-            {
-                const size_t first = iBlock * rowsPerBlock;
-                const size_t last  = min<cpu>(static_cast<decltype(xRowCount)>(first + rowsPerBlock), xRowCount);
+            DAAL_CHECK_MALLOC_THR(bboxLocal);
+            const size_t first = iBlock * rowsPerBlock;
+            const size_t last  = min<cpu>(static_cast<decltype(xRowCount)>(first + rowsPerBlock), xRowCount);
 
-                if (first < last)
+            if (first < last)
+            {
+                BBox b;
+                size_t i = first;
+                b.upper  = dx[indexes[i]];
+                b.lower  = dx[indexes[i]];
+                PRAGMA_IVDEP
+                for (++i; i < last; ++i)
                 {
-                    BBox b;
-                    size_t i = first;
-                    b.upper  = dx[indexes[i]];
-                    b.lower  = dx[indexes[i]];
-                    PRAGMA_IVDEP
-                    for (++i; i < last; ++i)
+                    if (b.lower > dx[indexes[i]])
                     {
-                        if (b.lower > dx[indexes[i]])
-                        {
-                            b.lower = dx[indexes[i]];
-                        }
-                        if (b.upper < dx[indexes[i]])
-                        {
-                            b.upper = dx[indexes[i]];
-                        }
-                    }
-
-                    if (bboxLocal->upper < b.upper)
-                    {
-                        bboxLocal->upper = b.upper;
+                        b.lower = dx[indexes[i]];
                     }
-                    if (bboxLocal->lower > b.lower)
+                    if (b.upper < dx[indexes[i]])
                     {
-                        bboxLocal->lower = b.lower;
+                        b.upper = dx[indexes[i]];
                     }
                 }
+
+                if (bboxLocal->upper < b.upper)
+                {
+                    bboxLocal->upper = b.upper;
+                }
+                if (bboxLocal->lower > b.lower)
+                {
+                    bboxLocal->lower = b.lower;
+                }
             }
         });
 
+        status = safeStat.detach();
+        if (!status) return status;
+
         bboxTLS.reduce([=](BBox * v) -> void {
             if (v)
             {
@@ -713,8 +771,8 @@ size_t KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     const auto blockCount    = (end - start + rowsPerBlock - 1) / rowsPerBlock;
     const auto idxMultiplier = 16; // For cache line separation.
 
-    size_t * leftSegmentStartPerBlock  = static_cast<size_t *>(service_malloc<size_t, cpu>(idxMultiplier * (blockCount + 1) * sizeof(size_t)));
-    size_t * rightSegmentStartPerBlock = static_cast<size_t *>(service_malloc<size_t, cpu>(idxMultiplier * blockCount * sizeof(size_t)));
+    size_t * leftSegmentStartPerBlock  = static_cast<size_t *>(service_malloc<size_t, cpu>(idxMultiplier * (blockCount + 1)));
+    size_t * rightSegmentStartPerBlock = static_cast<size_t *>(service_malloc<size_t, cpu>(idxMultiplier * blockCount));
 
     if (!leftSegmentStartPerBlock || !rightSegmentStartPerBlock)
     {
@@ -846,7 +904,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
             (rx != wx) ?
                 wx :
                 (buffer ? buffer :
-                          (buffer = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(xRowCount * sizeof(algorithmFpType)))));
+                          (buffer = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(xRowCount))));
         if (!awx)
         {
             status.add(services::ErrorMemoryAllocationFailed);
@@ -928,10 +986,10 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     const size_t xColumnCount = x.getNumberOfColumns();
 
     const algorithmFpType base    = 2.0;
-    const size_t expectedMaxDepth = (Math::sLog(xRowCount) / Math::sLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR;
-    const size_t stackSize        = Math::sPowx(base, Math::sCeil(Math::sLog(expectedMaxDepth) / Math::sLog(base)));
+    const size_t expectedMaxDepth = (Math::xsLog(xRowCount) / Math::xsLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR;
+    const size_t stackSize        = Math::xsPowx(base, Math::xsCeil(Math::xsLog(expectedMaxDepth) / Math::xsLog(base)));
 
-    BuildNode * bnQ = static_cast<BuildNode *>(service_malloc<BuildNode, cpu>(q.size() * sizeof(BuildNode)));
+    BuildNode * bnQ = static_cast<BuildNode *>(service_malloc<BuildNode, cpu>(q.size()));
     DAAL_CHECK_MALLOC(bnQ)
     size_t posQ = 0;
     while (q.size() > 0)
@@ -969,7 +1027,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
     const size_t maxNodeCount   = kdTreeTable.getNumberOfRows();
     const size_t emptyNodeCount = maxNodeCount - lastNodeIndex;
     const size_t segment        = (emptyNodeCount + maxThreads - 1) / maxThreads;
-    size_t * firstNodeIndex     = static_cast<size_t *>(service_malloc<size_t, cpu>((maxThreads + 1) * sizeof(*firstNodeIndex)));
+    size_t * firstNodeIndex     = static_cast<size_t *>(service_malloc<size_t, cpu>((maxThreads + 1)));
     DAAL_CHECK_MALLOC(firstNodeIndex)
     size_t nodeIndex = lastNodeIndex;
     for (size_t i = 0; i < maxThreads; ++i)
@@ -988,7 +1046,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
             if (!(((ptr->bboxes = service_scalable_calloc<BBox, cpu>(ptr->bboxesCapacity * xColumnCount)) != nullptr)
                   && ((ptr->inSortValues = service_scalable_calloc<IdxValue, cpu>(__KDTREE_INDEX_VALUE_PAIRS_PER_THREAD)) != nullptr)
                   && ((ptr->outSortValues = service_scalable_calloc<IdxValue, cpu>(__KDTREE_INDEX_VALUE_PAIRS_PER_THREAD)) != nullptr)
-                  && ((ptr->fixupQueue = static_cast<size_t *>(service_malloc<size_t, cpu>(ptr->fixupQueueCapacity * sizeof(size_t)))) != nullptr)
+                  && ((ptr->fixupQueue = static_cast<size_t *>(service_malloc<size_t, cpu>(ptr->fixupQueueCapacity))) != nullptr)
                   && ptr->buildStack.init(stackSize)))
             {
                 status.add(services::ErrorMemoryAllocationFailed);
@@ -1087,7 +1145,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
                                 if (local->fixupQueueIndex >= local->fixupQueueCapacity)
                                 {
                                     const size_t newCapacity = local->fixupQueueCapacity * 2;
-                                    size_t * const newQueue  = static_cast<size_t *>(service_malloc<size_t, cpu>(newCapacity * sizeof(size_t)));
+                                    size_t * const newQueue  = static_cast<size_t *>(service_malloc<size_t, cpu>(newCapacity));
                                     DAAL_CHECK_THR(newQueue, services::ErrorMemoryAllocationFailed);
                                     result |= daal::services::internal::daal_memcpy_s(newQueue, newCapacity * sizeof(size_t), local->fixupQueue,
                                                                                       local->fixupQueueIndex * sizeof(size_t));
@@ -1126,13 +1184,13 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
                                             local->extraKDTreeNodesCapacity > 0 ? local->extraKDTreeNodesCapacity * 2 : static_cast<size_t>(1024),
                                             extraIndex + 1);
                                         KDTreeNode * const newNodes =
-                                            static_cast<KDTreeNode *>(service_malloc<KDTreeNode, cpu>(newCapacity * sizeof(KDTreeNode)));
+                                            static_cast<KDTreeNode *>(service_malloc<KDTreeNode, cpu>(newCapacity));
 
                                         DAAL_CHECK_THR(newNodes, services::ErrorMemoryAllocationFailed);
 
-                                        result |= daal::services::internal::daal_memcpy_s(newNodes, newCapacity * sizeof(KDTreeNode),
+                                        result |= daal::services::internal::daal_memcpy_s(newNodes, newCapacity,
                                                                                           local->extraKDTreeNodes,
-                                                                                          local->extraKDTreeNodesCapacity * sizeof(KDTreeNode));
+                                                                                          local->extraKDTreeNodesCapacity);
                                         KDTreeNode * oldNodes           = local->extraKDTreeNodes;
                                         local->extraKDTreeNodes         = newNodes;
                                         local->extraKDTreeNodesCapacity = newCapacity;
@@ -1144,7 +1202,7 @@ Status KNNClassificationTrainBatchKernel<algorithmFpType, training::defaultDense
                                 {
                                     local->extraKDTreeNodesCapacity = max<cpu>(extraIndex + 1, static_cast<size_t>(1024));
                                     local->extraKDTreeNodes         = static_cast<KDTreeNode *>(
-                                        service_malloc<KDTreeNode, cpu>(local->extraKDTreeNodesCapacity * sizeof(KDTreeNode)));
+                                        service_malloc<KDTreeNode, cpu>(local->extraKDTreeNodesCapacity));
 
                                     DAAL_CHECK_THR(local->extraKDTreeNodes, services::ErrorMemoryAllocationFailed);
                                 }
@@ -1355,7 +1413,7 @@ algorithmFpType KNNClassificationTrainBatchKernel<algorithmFpType, training::def
         sampleCount = __KDTREE_MIN_SAMPLES + 1;
     }
 
-    algorithmFpType * samples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(sampleCount * sizeof(*samples)));
+    algorithmFpType * samples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(sampleCount));
     if (!samples)
     {
         status = services::ErrorMemoryAllocationFailed;
@@ -1380,7 +1438,7 @@ algorithmFpType KNNClassificationTrainBatchKernel<algorithmFpType, training::def
     samples[i] = upper;
     daal::algorithms::internal::qSort<algorithmFpType, cpu>(sampleCount, samples);
 
-    size_t * hist = static_cast<size_t *>(service_malloc<size_t, cpu>(sampleCount * sizeof(*hist)));
+    size_t * hist = static_cast<size_t *>(service_malloc<size_t, cpu>(sampleCount));
     if (!hist)
     {
         status = services::ErrorMemoryAllocationFailed;
@@ -1393,7 +1451,7 @@ algorithmFpType KNNClassificationTrainBatchKernel<algorithmFpType, training::def
     }
 
     size_t subSampleCount        = (end - start) / __KDTREE_SEARCH_SKIP + 1;
-    algorithmFpType * subSamples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(subSampleCount * sizeof(*subSamples)));
+    algorithmFpType * subSamples = static_cast<algorithmFpType *>(service_malloc<algorithmFpType, cpu>(subSampleCount));
     if (!subSamples)
     {
         status = services::ErrorMemoryAllocationFailed;
diff --git a/cpp/daal/src/externals/service_math.h b/cpp/daal/src/externals/service_math.h
index bbf948bd500..7a4e6221151 100644
--- a/cpp/daal/src/externals/service_math.h
+++ b/cpp/daal/src/externals/service_math.h
@@ -50,18 +50,32 @@ struct Math
 
     static fpType sPowx(fpType in, fpType in1) { return _impl<fpType, cpu>::sPowx(in, in1); }
 
+    static fpType xsPowx(fpType in, fpType in1) { return _impl<fpType, cpu>::xsPowx(in, in1); }
+
     static fpType sCeil(fpType in) { return _impl<fpType, cpu>::sCeil(in); }
 
+    static fpType xsCeil(fpType in) { return _impl<fpType, cpu>::xsCeil(in); }
+
     static fpType sErfInv(fpType in) { return _impl<fpType, cpu>::sErfInv(in); }
 
+    static fpType xsErfInv(fpType in) { return _impl<fpType, cpu>::xsErfInv(in); }
+
     static fpType sErf(fpType in) { return _impl<fpType, cpu>::sErf(in); }
 
+    static fpType xsErf(fpType in) { return _impl<fpType, cpu>::xsErf(in); }
+
     static fpType sLog(fpType in) { return _impl<fpType, cpu>::sLog(in); }
 
+    static fpType xsLog(fpType in) { return _impl<fpType, cpu>::xsLog(in); }
+
     static fpType sCdfNormInv(fpType in) { return _impl<fpType, cpu>::sCdfNormInv(in); }
 
+    static fpType xsCdfNormInv(fpType in) { return _impl<fpType, cpu>::xsCdfNormInv(in); }
+
     static void vPowx(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl<fpType, cpu>::vPowx(n, in, in1, out); }
 
+    static void xvPowx(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl<fpType, cpu>::xvPowx(n, in, in1, out); }
+
     static void vPowxAsLnExp(SizeType n, const fpType * in, fpType in1, fpType * out)
     {
         _impl<fpType, cpu>::vLog(n, in, out);
@@ -72,25 +86,53 @@ struct Math
         _impl<fpType, cpu>::vExp(n, out, out);
     }
 
+    static void xvPowxAsLnExp(SizeType n, const fpType * in, fpType in1, fpType * out) // out[i] = exp(in1 * log(in[i])) via the xv* backend calls
+    {
+        _impl<fpType, cpu>::xvLog(n, in, out); // step 1: logarithm of every input element, stored in out
+        for (size_t i = 0; i < n; i++)
+        {
+            out[i] *= in1; // step 2: scale each logarithm by the exponent
+        }
+        _impl<fpType, cpu>::xvExp(n, out, out); // step 3: exponentiate in place (out is both source and destination)
+    }
+
     static void vCeil(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vCeil(n, in, out); }
 
+    static void xvCeil(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvCeil(n, in, out); }
+
     static void vErfInv(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vErfInv(n, in, out); }
 
+    static void xvErfInv(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvErfInv(n, in, out); }
+
     static void vErf(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vErf(n, in, out); }
 
+    static void xvErf(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvErf(n, in, out); }
+
     static void vExp(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vExp(n, in, out); }
 
+    static void xvExp(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvExp(n, in, out); }
+
     static fpType vExpThreshold() { return _impl<fpType, cpu>::vExpThreshold(); }
 
     static void vTanh(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vTanh(n, in, out); }
 
+    static void xvTanh(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvTanh(n, in, out); }
+
     static void vSqrt(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vSqrt(n, in, out); }
 
+    static void xvSqrt(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvSqrt(n, in, out); }
+
     static void vLog(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vLog(n, in, out); }
 
+    static void xvLog(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvLog(n, in, out); }
+
     static void vLog1p(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vLog1p(n, in, out); }
 
+    static void xvLog1p(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvLog1p(n, in, out); }
+
     static void vCdfNormInv(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vCdfNormInv(n, in, out); }
+
+    static void xvCdfNormInv(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::xvCdfNormInv(n, in, out); }
 };
 
 } // namespace internal
diff --git a/cpp/daal/src/externals/service_math_mkl.h b/cpp/daal/src/externals/service_math_mkl.h
index fa5ce46a5ea..08df5d7b11d 100644
--- a/cpp/daal/src/externals/service_math_mkl.h
+++ b/cpp/daal/src/externals/service_math_mkl.h
@@ -66,6 +66,13 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsPowx(double in, double in1)
+    {
+        double r;
+        xvPowx(1, &in, in1, &r);
+        return r;
+    }
+
     static double sCeil(double in)
     {
         double r;
@@ -73,6 +80,13 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsCeil(double in)
+    {
+        double r;
+        xvCeil(1, &in, &r);
+        return r;
+    }
+
     static double sErfInv(double in)
     {
         double r;
@@ -80,6 +94,13 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsErfInv(double in)
+    {
+        double r;
+        xvErfInv(1, &in, &r);
+        return r;
+    }
+
     static double sErf(double in)
     {
         double r;
@@ -87,6 +108,13 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsErf(double in)
+    {
+        double r;
+        xvErf(1, &in, &r);
+        return r;
+    }
+
     static double sLog(double in)
     {
         double r;
@@ -94,6 +122,13 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsLog(double in)
+    {
+        double r;
+        xvLog(1, &in, &r);
+        return r;
+    }
+
     static double sCdfNormInv(double in)
     {
         double r;
@@ -101,56 +136,133 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static double xsCdfNormInv(double in)
+    {
+        double r;
+        xvCdfNormInv(1, &in, &r);
+        return r;
+    }
+
     static void vPowx(SizeType n, const double * in, double in1, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvPowx(SizeType n, const double * in, double in1, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vCeil(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvCeil(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vErfInv(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvErfInv(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vErf(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvErf(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vExp(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvExp(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static double vExpThreshold() { return -650.0; }
 
     static void vTanh(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvTanh(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vSqrt(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvSqrt(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vLog(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvLog(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vLog1p(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvLog1p(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vCdfNormInv(SizeType n, const double * in, double * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvCdfNormInv(SizeType n, const double * in, double * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
@@ -178,6 +290,13 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsPowx(float in, float in1)
+    {
+        float r;
+        xvPowx(1, &in, in1, &r);
+        return r;
+    }
+
     static float sCeil(float in)
     {
         float r;
@@ -185,6 +304,13 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsCeil(float in)
+    {
+        float r;
+        xvCeil(1, &in, &r);
+        return r;
+    }
+
     static float sErfInv(float in)
     {
         float r;
@@ -192,6 +318,13 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsErfInv(float in)
+    {
+        float r;
+        xvErfInv(1, &in, &r);
+        return r;
+    }
+
     static float sErf(float in)
     {
         float r;
@@ -199,6 +332,13 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsErf(float in)
+    {
+        float r;
+        xvErf(1, &in, &r);
+        return r;
+    }
+
     static float sLog(float in)
     {
         float r;
@@ -206,6 +346,13 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsLog(float in)
+    {
+        float r;
+        xvLog(1, &in, &r);
+        return r;
+    }
+
     static float sCdfNormInv(float in)
     {
         float r;
@@ -213,56 +360,133 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static float xsCdfNormInv(float in)
+    {
+        float r;
+        xvCdfNormInv(1, &in, &r);
+        return r;
+    }
+
     static void vPowx(SizeType n, const float * in, float in1, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvPowx(SizeType n, const float * in, float in1, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vCeil(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvCeil(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vErfInv(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvErfInv(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vErf(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvErf(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vExp(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvExp(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static float vExpThreshold() { return -75.0f; }
 
     static void vTanh(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvTanh(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vSqrt(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvSqrt(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vLog(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvLog(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vLog1p(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvLog1p(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void vCdfNormInv(SizeType n, const float * in, float * out)
     {
-        __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
+    static void xvCdfNormInv(SizeType n, const float * in, float * out)
+    {
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
diff --git a/cpp/daal/src/externals/service_math_ref.h b/cpp/daal/src/externals/service_math_ref.h
index 07062c1ba2f..5ac4d515dd1 100644
--- a/cpp/daal/src/externals/service_math_ref.h
+++ b/cpp/daal/src/externals/service_math_ref.h
@@ -58,48 +58,88 @@ struct RefMath<double, cpu>
 
     static double sPowx(double in, double in1) { return pow(in, in1); }
 
+    static double xsPowx(double in, double in1) { return pow(in, in1); }
+
     static double sCeil(double in) { return ceil(in); }
 
+    static double xsCeil(double in) { return ceil(in); }
+
     // Not implemented
     static double sErfInv(double in) { return std::numeric_limits<double>::quiet_NaN(); }
 
+    // Not implemented
+    static double xsErfInv(double in) { return std::numeric_limits<double>::quiet_NaN(); }
+
     static double sErf(double in) { return erf(in); }
 
+    static double xsErf(double in) { return erf(in); }
+
     static double sLog(double in) { return log(in); }
 
+    static double xsLog(double in) { return log(in); }
+
     // Not implemented
     static double sCdfNormInv(double in) { return std::numeric_limits<double>::quiet_NaN(); }
 
+    // Not implemented
+    static double xsCdfNormInv(double in) { return std::numeric_limits<double>::quiet_NaN(); }
+
     static void vPowx(SizeType n, const double * in, double in1, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1);
     }
 
+    static void xvPowx(SizeType n, const double * in, double in1, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1);
+    }
+
     static void vCeil(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]);
     }
 
+    static void xvCeil(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]);
+    }
+
     // Not implemented
     static void vErfInv(SizeType n, const double * in, double * out)
     {
         for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<double>::quiet_NaN();
     }
 
+    // Not implemented
+    static void xvErfInv(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<double>::quiet_NaN();
+    }
+
     static void vErf(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]);
     }
 
+    static void xvErf(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]);
+    }
+
     static void vExp(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]);
     }
 
+    static void xvExp(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]);
+    }
+
     static double vExpThreshold()
     {
         return -650.0;
@@ -111,29 +151,55 @@ struct RefMath<double, cpu>
         for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]);
     }
 
+    static void xvTanh(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]);
+    }
+
     static void vSqrt(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]);
     }
 
+    static void xvSqrt(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]);
+    }
+
     static void vLog(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]);
     }
 
+    static void xvLog(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]);
+    }
+
     static void vLog1p(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]);
     }
 
+    static void xvLog1p(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]);
+    }
+
     // Not implemented
     static void vCdfNormInv(SizeType n, const double * in, double * out)
     {
         for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<double>::quiet_NaN();
     }
+
+    // Not implemented
+    static void xvCdfNormInv(SizeType n, const double * in, double * out)
+    {
+        for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<double>::quiet_NaN();
+    }
 };
 
 /*
@@ -155,48 +221,87 @@ struct RefMath<float, cpu>
 
     static float sPowx(float in, float in1) { return pow(in, in1); }
 
+    static float xsPowx(float in, float in1) { return pow(in, in1); } // returns pow(in, in1), the same expression sPowx uses
+
     static float sCeil(float in) { return ceil(in); }
 
+    static float xsCeil(float in) { return ceil(in); } // returns ceil(in), the same expression sCeil uses
+
     // Not implemented
     static float sErfInv(float in) { return std::numeric_limits<float>::quiet_NaN(); }
 
+    // Not implemented: the argument is ignored and a quiet NaN is always returned.
+    static float xsErfInv(float in) { return std::numeric_limits<float>::quiet_NaN(); }
+
     static float sErf(float in) { return erf(in); }
 
+    static float xsErf(float in) { return erf(in); } // returns erf(in), the same expression sErf uses
+
     static float sLog(float in) { return log(in); }
 
+    static float xsLog(float in) { return log(in); } // returns log(in), the same expression sLog uses
+
     // Not implemented
     static float sCdfNormInv(float in) { return std::numeric_limits<float>::quiet_NaN(); }
 
+    static float xsCdfNormInv(float in) { return std::numeric_limits<float>::quiet_NaN(); } // Not implemented: always returns a quiet NaN
+
     static void vPowx(SizeType n, const float * in, float in1, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1);
     }
 
+    static void xvPowx(SizeType n, const float * in, float in1, float * out) // out[k] = pow(in[k], in1) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = pow(in[k], in1); }
+    }
+
     static void vCeil(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]);
     }
 
+    static void xvCeil(SizeType n, const float * in, float * out) // out[k] = ceil(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = ceil(in[k]); }
+    }
+
     // Not implemented
     static void vErfInv(SizeType n, const float * in, float * out)
     {
         for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<float>::quiet_NaN();
     }
 
+    // Placeholder (not implemented): never reads in; stores a quiet NaN in each of out[0] .. out[n - 1].
+    static void xvErfInv(SizeType n, const float * in, float * out)
+    {
+        for (SizeType idx = 0; idx < n; ++idx) { out[idx] = std::numeric_limits<float>::quiet_NaN(); }
+    }
+
     static void vErf(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]);
     }
 
+    static void xvErf(SizeType n, const float * in, float * out) // out[k] = erf(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = erf(in[k]); }
+    }
+
     static void vExp(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]);
     }
 
+    static void xvExp(SizeType n, const float * in, float * out) // out[k] = exp(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = exp(in[k]); }
+    }
+
     static float vExpThreshold()
     {
         return -75.0f;
@@ -208,29 +313,54 @@ struct RefMath<float, cpu>
         for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]);
     }
 
+    static void xvTanh(SizeType n, const float * in, float * out) // out[k] = tanh(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = tanh(in[k]); }
+    }
+
     static void vSqrt(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]);
     }
 
+    static void xvSqrt(SizeType n, const float * in, float * out) // out[k] = sqrt(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = sqrt(in[k]); }
+    }
+
     static void vLog(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]);
     }
 
+    static void xvLog(SizeType n, const float * in, float * out) // out[k] = log(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = log(in[k]); }
+    }
+
     static void vLog1p(SizeType n, const float * in, float * out)
     {
 #pragma omp simd
         for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]);
     }
 
+    static void xvLog1p(SizeType n, const float * in, float * out) // out[k] = log1p(in[k]) for k in [0, n)
+    {
+        for (SizeType k = 0; k < n; ++k) { out[k] = log1p(in[k]); }
+    }
+
     // Not implemented
     static void vCdfNormInv(SizeType n, const float * in, float * out)
     {
         for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits<float>::quiet_NaN();
     }
+    // Placeholder (not implemented): never reads in; stores a quiet NaN in each of out[0] .. out[n - 1].
+    static void xvCdfNormInv(SizeType n, const float * in, float * out)
+    {
+        for (SizeType idx = 0; idx < n; ++idx) { out[idx] = std::numeric_limits<float>::quiet_NaN(); }
+    }
 };
 
 } // namespace ref