diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h index 499754808fd..3c70e6c68dd 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch.h @@ -67,10 +67,11 @@ class KNNClassificationPredictKernel : publi const daal::algorithms::Parameter * par); protected: - void findNearestNeighbors(const algorithmFpType * query, Heap, cpu> & heap, - kdtree_knn_classification::internal::Stack, cpu> & stack, size_t k, algorithmFpType radius, - const KDTreeTable & kdTreeTable, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA, - services::internal::TArrayScalable & soa_arrays); + services::Status findNearestNeighbors(const algorithmFpType * query, Heap, cpu> & heap, + kdtree_knn_classification::internal::Stack, cpu> & stack, size_t k, + algorithmFpType radius, const KDTreeNode * nodes, size_t rootTreeNodeIndex, + const NumericTable & data, const bool isHomogenSOA, + services::internal::TArrayScalable & soa_arrays); services::Status predict(algorithmFpType * predictedClass, const Heap, cpu> & heap, const NumericTable * labels, size_t k, VoteWeights voteWeights, const NumericTable * modelIndices, diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i old mode 100755 new mode 100644 index 82cb20faaed..17e1038b664 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_predict_dense_default_batch_impl.i @@ -39,6 +39,11 @@ #include "src/algorithms/k_nearest_neighbors/kdtree_knn_classification_model_impl.h" #include "src/algorithms/k_nearest_neighbors/kdtree_knn_impl.i" #include "src/algorithms/k_nearest_neighbors/knn_heap.h" +#include + +#if defined(DAAL_INTEL_CPP_COMPILER) + #include +#endif namespace daal { @@ -145,11 +150,33 @@ Status KNNClassificationPredictKernel::compu if (par3 == NULL) return Status(ErrorNullParameterNotSupported); - const Model * const model = static_cast(m); - const auto & kdTreeTable = *(model->impl()->getKDTreeTable()); - const auto rootTreeNodeIndex = model->impl()->getRootNodeIndex(); - const NumericTable & data = *(model->impl()->getData()); - const NumericTable * labels = nullptr; + const Model * const model = static_cast(m); + const KDTreeTable & kdTreeTable = *(model->impl()->getKDTreeTable()); + const KDTreeNode * const nodes = static_cast(kdTreeTable.getArray()); + const size_t xRowCount = x->getNumberOfRows(); + + + + const algorithmFpType base = 2.0; + const algorithmFpType baseInPower = Math::sPowx(base, Math::sCeil(Math::sLog(base * xRowCount - 1) / Math::sLog(base))); + DAAL_ASSERT(baseInPower > 0) + const size_t maxKDTreeNodeCount = ((size_t)baseInPower * __KDTREE_MAX_NODE_COUNT_MULTIPLICATION_FACTOR) / __KDTREE_LEAF_BUCKET_SIZE + 1; + for(int index = 0; index < maxKDTreeNodeCount; index++){ + const KDTreeNode& node = nodes[index]; + + + std::cout << "Node Index: " << index + << ", Dimension: " << node.dimension + << ", Cut Point: " << node.cutPoint + << ", Left Index: " << node.leftIndex + << ", Right Index: " << node.rightIndex << std::endl; + } + + + + const auto rootTreeNodeIndex = model->impl()->getRootNodeIndex(); + const NumericTable & data = *(model->impl()->getData()); + const NumericTable * labels = nullptr; if (resultsToEvaluate != 0) { labels = model->impl()->getLabels().get(); @@ -164,28 +191,29 @@ Status KNNClassificationPredictKernel::compu } const size_t heapSize = (iSize / 16 + 1) * 16; - const size_t xRowCount = x->getNumberOfRows(); - const algorithmFpType base = 2.0; - const size_t expectedMaxDepth = (Math::sLog(xRowCount) / Math::sLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR; - const size_t stackSize = Math::sPowx(base, Math::sCeil(Math::sLog(expectedMaxDepth) / Math::sLog(base))); + // const size_t xRowCount = x->getNumberOfRows(); + // const algorithmFpType base = 2.0; + const size_t expectedMaxDepth = (Math::xsLog(xRowCount) / Math::xsLog(base) + 1) * __KDTREE_DEPTH_MULTIPLICATION_FACTOR; + const size_t stackSize = Math::xsPowx(base, Math::xsCeil(Math::xsLog(expectedMaxDepth) / Math::xsLog(base))); struct Local { MaxHeap heap; SearchStack stack; }; + SafeStatus safeStat; daal::tls localTLS([&]() -> Local * { Local * const ptr = service_scalable_calloc(1); if (ptr) { if (!ptr->heap.init(heapSize)) { - status.add(services::ErrorMemoryAllocationFailed); + safeStat.add(services::ErrorMemoryAllocationFailed); service_scalable_free(ptr); return nullptr; } if (!ptr->stack.init(stackSize)) { - status.add(services::ErrorMemoryAllocationFailed); + safeStat.add(services::ErrorMemoryAllocationFailed); ptr->heap.clear(); service_scalable_free(ptr); return nullptr; @@ -193,7 +221,7 @@ Status KNNClassificationPredictKernel::compu } else { - status.add(services::ErrorMemoryAllocationFailed); + safeStat.add(services::ErrorMemoryAllocationFailed); } return ptr; }); @@ -201,23 +229,23 @@ Status KNNClassificationPredictKernel::compu DAAL_CHECK_STATUS_OK((status.ok()), status); const auto maxThreads = threader_get_threads_number(); + auto nThreads = (maxThreads < 1) ? 1 : maxThreads; const size_t xColumnCount = x->getNumberOfColumns(); - const auto rowsPerBlock = (xRowCount + maxThreads - 1) / maxThreads; + const auto rowsPerBlock = (xRowCount + nThreads - 1) / nThreads; const auto blockCount = (xRowCount + rowsPerBlock - 1) / rowsPerBlock; - SafeStatus safeStat; services::internal::TArrayScalable soa_arrays; bool isHomogenSOA = checkHomogenSOA(data, soa_arrays); daal::threader_for(blockCount, blockCount, [&](int iBlock) { Local * const local = localTLS.local(); - if (local) - { - services::Status s; + DAAL_CHECK_MALLOC_THR(local); - const size_t first = iBlock * rowsPerBlock; - const size_t last = min(static_cast(first + rowsPerBlock), xRowCount); + const size_t first = iBlock * rowsPerBlock; + const size_t last = min(static_cast(first + rowsPerBlock), xRowCount); + if (local) + { const algorithmFpType radius = MaxVal::get(); data_management::BlockDescriptor xBD; const_cast(*x).getBlockOfRows(first, last - first, readOnly, xBD); @@ -227,60 +255,55 @@ Status KNNClassificationPredictKernel::compu data_management::BlockDescriptor distancesBD; if (indices) { - s = indices->getBlockOfRows(first, last - first, writeOnly, indicesBD); - DAAL_CHECK_STATUS_THR(s); + DAAL_CHECK_STATUS_THR(indices->getBlockOfRows(first, last - first, writeOnly, indicesBD)); } if (distances) { - s = distances->getBlockOfRows(first, last - first, writeOnly, distancesBD); - DAAL_CHECK_STATUS_THR(s); + DAAL_CHECK_STATUS_THR(distances->getBlockOfRows(first, last - first, writeOnly, distancesBD)); } if (labels) { const size_t yColumnCount = y->getNumberOfColumns(); data_management::BlockDescriptor yBD; - y->getBlockOfRows(first, last - first, writeOnly, yBD); + DAAL_CHECK_STATUS_THR(y->getBlockOfRows(first, last - first, writeOnly, yBD)); auto * const dy = yBD.getBlockPtr(); for (size_t i = 0; i < last - first; ++i) { - findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, kdTreeTable, rootTreeNodeIndex, data, - isHomogenSOA, soa_arrays); - s = predict(&(dy[i * yColumnCount]), local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses); - DAAL_CHECK_STATUS_THR(s) + DAAL_CHECK_STATUS_THR(findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, nodes, + rootTreeNodeIndex, data, isHomogenSOA, soa_arrays)); + DAAL_CHECK_STATUS_THR( + predict(&dy[i * yColumnCount], local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses)); } - - s |= y->releaseBlockOfRows(yBD); - DAAL_CHECK_STATUS_THR(s); + DAAL_CHECK_STATUS_THR(y->releaseBlockOfRows(yBD)); } else { for (size_t i = 0; i < last - first; ++i) { - findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, kdTreeTable, rootTreeNodeIndex, data, - isHomogenSOA, soa_arrays); - s = predict(nullptr, local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses); - DAAL_CHECK_STATUS_THR(s) + DAAL_CHECK_STATUS_THR(findNearestNeighbors(&dx[i * xColumnCount], local->heap, local->stack, k, radius, nodes, + rootTreeNodeIndex, data, isHomogenSOA, soa_arrays)); + DAAL_CHECK_STATUS_THR(predict(nullptr, local->heap, labels, k, voteWeights, modelIndices, indicesBD, distancesBD, i, nClasses)); } } if (indices) { - s |= indices->releaseBlockOfRows(indicesBD); + DAAL_CHECK_STATUS_THR(indices->releaseBlockOfRows(indicesBD)); } - DAAL_CHECK_STATUS_THR(s); + if (distances) { - s |= distances->releaseBlockOfRows(distancesBD); + DAAL_CHECK_STATUS_THR(distances->releaseBlockOfRows(distancesBD)); } - DAAL_CHECK_STATUS_THR(s); const_cast(*x).releaseBlockOfRows(xBD); } }); - DAAL_CHECK_SAFE_STATUS() + status = safeStat.detach(); + if (!status) return status; localTLS.reduce([&](Local * ptr) -> void { if (ptr) @@ -298,52 +321,59 @@ DAAL_FORCEINLINE void computeDistance(size_t start, size_t end, algorithmFpType const NumericTable & data, data_management::BlockDescriptor xBD[2], services::internal::TArrayScalable & soa_arrays) { + // Initialize the distance array to zero for the range [start, end) for (size_t i = start; i < end; ++i) { distance[i - start] = 0; } - size_t curBDIdx = 0; - size_t nextBDIdx = 1; + size_t curBDIdx = 0; // Current block descriptor index + size_t nextBDIdx = 1; // Next block descriptor index - const size_t xColumnCount = data.getNumberOfColumns(); + const size_t xColumnCount = data.getNumberOfColumns(); // Total number of columns in the data - const algorithmFpType * nx = nullptr; - const algorithmFpType * dx = getNtData(isHomogenSOA, 0, start, end - start, data, xBD[curBDIdx], soa_arrays); + const algorithmFpType * dx = + getNtData(isHomogenSOA, 0, start, end - start, data, xBD[curBDIdx], soa_arrays); // Retrieve data for the first column - size_t j; - for (j = 1; j < xColumnCount; ++j) + // Iterate over each column to compute squared distances + for (size_t j = 1; j < xColumnCount; ++j) { - nx = getNtData(isHomogenSOA, j, start, end - start, data, xBD[nextBDIdx], soa_arrays); + const algorithmFpType * nx = + getNtData(isHomogenSOA, j, start, end - start, data, xBD[nextBDIdx], soa_arrays); // Retrieve data for the next column + // Prefetch the next column data to optimize memory access DAAL_PREFETCH_READ_T0(nx); - DAAL_PREFETCH_READ_T0(nx + 16); + DAAL_PREFETCH_READ_T0(nx + 16); // Adjust prefetch based on expected access patterns + // Compute distance contributions from the current column for (size_t i = 0; i < end - start; ++i) { distance[i] += (query[j - 1] - dx[i]) * (query[j - 1] - dx[i]); } + // Release the current block of data to avoid memory leaks releaseNtData(isHomogenSOA, data, xBD[curBDIdx]); + // Swap block descriptors and pointers for the next iteration services::internal::swap(curBDIdx, nextBDIdx); services::internal::swap(dx, nx); } - { - for (size_t i = 0; i < end - start; ++i) - { - distance[i] += (query[j - 1] - dx[i]) * (query[j - 1] - dx[i]); - } - releaseNtData(isHomogenSOA, data, xBD[curBDIdx]); + // Handle the last column after the loop + for (size_t i = 0; i < end - start; ++i) + { + distance[i] += (query[xColumnCount - 1] - dx[i]) * (query[xColumnCount - 1] - dx[i]); } + + // Release the final block of data + releaseNtData(isHomogenSOA, data, xBD[curBDIdx]); } template -void KNNClassificationPredictKernel::findNearestNeighbors( +services::Status KNNClassificationPredictKernel::findNearestNeighbors( const algorithmFpType * query, Heap, cpu> & heap, kdtree_knn_classification::internal::Stack, cpu> & stack, size_t k, algorithmFpType radius, - const KDTreeTable & kdTreeTable, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA, + const KDTreeNode * nodes, size_t rootTreeNodeIndex, const NumericTable & data, const bool isHomogenSOA, services::internal::TArrayScalable & soa_arrays) { heap.reset(); @@ -354,22 +384,20 @@ void KNNClassificationPredictKernel::findNea const KDTreeNode * node; cur.nodeIndex = rootTreeNodeIndex; cur.minDistance = 0; - - DAAL_ALIGNAS(256) algorithmFpType distance[__KDTREE_LEAF_BUCKET_SIZE + 1]; + algorithmFpType distance[__KDTREE_LEAF_BUCKET_SIZE + 1]; size_t start, end; data_management::BlockDescriptor xBD[2]; + for (;;) { - node = static_cast(kdTreeTable.getArray()) + cur.nodeIndex; - if (node->dimension == __KDTREE_NULLDIMENSION) + node = &nodes[cur.nodeIndex]; + if (node->dimension >1000) { start = node->leftIndex; end = node->rightIndex; - computeDistance(start, end, distance, query, isHomogenSOA, data, xBD, soa_arrays); - - for (i = start; i < end; ++i) + for (i = start; i < end; i++) { if (distance[i - start] <= radius) { @@ -398,7 +426,7 @@ void KNNClassificationPredictKernel::findNea if (!stack.empty()) { cur = stack.pop(); - DAAL_PREFETCH_READ_T0(static_cast(kdTreeTable.getArray()) + cur.nodeIndex); + //DAAL_PREFETCH_READ_T0(nodes[cur.nodeIndex]); } else { @@ -409,7 +437,6 @@ void KNNClassificationPredictKernel::findNea { algorithmFpType val = query[node->dimension]; const algorithmFpType diff = val - node->cutPoint; - if (cur.minDistance <= radius) { cur.nodeIndex = (diff < 0) ? node->leftIndex : node->rightIndex; @@ -421,7 +448,7 @@ void KNNClassificationPredictKernel::findNea else if (!stack.empty()) { cur = stack.pop(); - DAAL_PREFETCH_READ_T0(static_cast(kdTreeTable.getArray()) + cur.nodeIndex); + //DAAL_PREFETCH_READ_T0(nodes[cur.nodeIndex]); } else { @@ -429,6 +456,7 @@ void KNNClassificationPredictKernel::findNea } } } + return services::Status(); } template @@ -441,7 +469,7 @@ services::Status KNNClassificationPredictKernel labelBD; - algorithmFpType * classes = static_cast(daal::services::internal::service_malloc(heapSize)); - algorithmFpType * classWeights = static_cast(daal::services::internal::service_malloc(nClasses)); - DAAL_CHECK_MALLOC(classWeights); - DAAL_CHECK_MALLOC(classes); + algorithmFpType * classes = + static_cast(daal::services::internal::service_malloc(heapSize * sizeof(algorithmFpType))); + DAAL_CHECK_MALLOC(classes) + algorithmFpType * classWeights = + static_cast(daal::services::internal::service_malloc(nClasses * sizeof(algorithmFpType))); + DAAL_CHECK_MALLOC(classWeights) for (size_t i = 0; i < nClasses; ++i) { @@ -563,10 +593,8 @@ services::Status KNNClassificationPredictKernel(classes); - service_free(classWeights); - classes = nullptr; + daal_free(classes); + daal_free(classWeights); } return services::Status(); diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i index 7822be4cd0b..b2f50feb4b7 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/kdtree_knn_classification_train_dense_default_impl.i @@ -61,39 +61,72 @@ using namespace kdtree_knn_classification::internal; template class Queue { + static const size_t defaultSize = 4; public: - Queue() : _data(nullptr) {} + Queue() : _data(nullptr), _first(0), _last(0), _count(0), _size(0), _capacity(0) {} ~Queue() { - services::daal_free(_data); - _data = nullptr; + clear(); } + Queue(const Queue &) = delete; + Queue & operator=(const Queue &) = delete; + bool init(size_t size) { clear(); + if (size == 0) // Check for valid size + { + return false; + } + _first = _count = 0; _last = _sizeMinus1 = (_size = size) - 1; - return ((_data = static_cast(service_malloc(size * sizeof(T)))) != nullptr); + _data = static_cast(service_malloc(size)); + + if (!_data) // Check if memory allocation was successful + { + return false; + } + + _capacity = _size; // Initialize capacity + return true; } void clear() { - daal_free(_data); - _data = nullptr; + if (_data) + { + daal::services::internal::service_free(_data); // Free allocated memory if it exists + _data = nullptr; + } + _first = _last = _count = _size = _sizeMinus1 = _capacity = 0; // Reset state } + void reset() { _first = _last = _count = 0; } + DAAL_FORCEINLINE void push(const T & value) { - _data[_last = (_last + 1) & _sizeMinus1] = value; + if (_count >= _capacity) // Check if capacity is exceeded + { + services::Status status = grow(); // Grow if necessary + //DAAL_CHECK_STATUS_VAR(status); + } + + _data[_last = (_last + 1) & _sizeMinus1] = value; // Add element to queue ++_count; } DAAL_FORCEINLINE T pop() { - const T value = _data[_first++]; - _first *= (_first != _size); + // if (empty()) // Check if queue is empty + // { + // throw std::underflow_error("Queue underflow: no elements to pop."); + // } + + const T value = _data[_first++]; // Retrieve element + _first *= (_first != _size); // Reset first index if it reaches the end --_count; return value; } @@ -102,13 +135,36 @@ public: size_t size() const { return _count; } + private: + services::Status grow() + { + int result = 0; + _capacity = (_capacity == 0 ? defaultSize : _capacity * 2); // Double capacity or set to default + + T * const newData = daal::services::internal::service_malloc(_capacity); + DAAL_CHECK_MALLOC(newData); + + if (_data != nullptr) + { + result = services::internal::daal_memcpy_s(newData, _last * sizeof(T), _data, _last * sizeof(T)); + daal::services::internal::service_free(_data); // Free old data + _data = nullptr; + } + + _data = newData; // Assign new expanded memory + _size = _capacity; // Adjust size to new capacity + _sizeMinus1 = _capacity - 1; // Update size minus 1 for wrapping + return (!result) ? services::Status() : services::Status(services::ErrorMemoryCopyFailedInternal); + } + T * _data; - size_t _first; - size_t _last; - size_t _count; - size_t _size; - size_t _sizeMinus1; + size_t _first; // Index of the first element + size_t _last; // Index of the last element + size_t _count; // Current number of elements + size_t _size; // Current size of the queue + size_t _sizeMinus1; // Helper for wrap-around logic + size_t _capacity; // Maximum capacity of the queue }; struct BuildNode @@ -160,14 +216,16 @@ Status KNNClassificationTrainBatchKernel q; BBox * bboxQ = nullptr; + auto oldThreads = services::Environment::getInstance()->getNumberOfThreads(); DAAL_CHECK_STATUS(status, buildFirstPartOfKDTree(q, bboxQ, *x, *r, indexes, engine)); + services::Environment::getInstance()->setNumberOfThreads(1); DAAL_CHECK_STATUS(status, buildSecondPartOfKDTree(q, bboxQ, *x, *r, indexes, engine)); + services::Environment::getInstance()->setNumberOfThreads(oldThreads); DAAL_CHECK_STATUS(status, rearrangePoints(*x, indexes)); if (y) { DAAL_CHECK_STATUS(status, rearrangePoints(*y, indexes)); } - daal_free(bboxQ); bboxQ = nullptr; return status; @@ -183,10 +241,9 @@ Status KNNClassificationTrainBatchKernel Math; typedef BoundingBox BBox; - const auto maxThreads = threader_get_threads_number(); const algorithmFpType base = 2.0; const size_t queueSize = - 2 * Math::sPowx(base, Math::sCeil(Math::sLog(__KDTREE_FIRST_PART_LEAF_NODES_PER_THREAD * maxThreads) / Math::sLog(base))); + 2 * Math::sPowx(base, Math::sCeil(Math::sLog(__KDTREE_FIRST_PART_LEAF_NODES_PER_THREAD) / Math::sLog(base))); const size_t firstPartLeafNodeCount = queueSize / 2; q.init(queueSize); const size_t xColumnCount = x.getNumberOfColumns(); @@ -196,7 +253,7 @@ Status KNNClassificationTrainBatchKernel(service_malloc(bboxSize * sizeof(BBox), sizeof(BBox))); + bboxQ = static_cast(service_malloc(bboxSize)); DAAL_CHECK_MALLOC(bboxQ) r.impl()->setLastNodeIndex(0); @@ -221,7 +278,7 @@ Status KNNClassificationTrainBatchKernel(service_malloc(subSampleCount * sizeof(algorithmFpType))); + algorithmFpType * subSamples = static_cast(service_malloc(subSampleCount)); DAAL_CHECK_MALLOC(subSamples) while (maxNodeCountForCurrentDepth < firstPartLeafNodeCount) @@ -315,8 +372,8 @@ Status KNNClassificationTrainBatchKernel(x).getBlockOfColumnValues(j, 0, xRowCount, readOnly, columnBD); const algorithmFpType * const dx = columnBD.getBlockPtr(); - - daal::tls bboxTLS([=, &status]() -> BBox * { + SafeStatus safeStat; + daal::tls bboxTLS([&]() -> BBox * { BBox * const ptr = service_scalable_calloc(1); if (ptr) { @@ -325,51 +382,52 @@ Status KNNClassificationTrainBatchKernel(static_cast(first + rowsPerBlock), xRowCount); + DAAL_CHECK_MALLOC_THR(bboxLocal); + const size_t first = iBlock * rowsPerBlock; + const size_t last = min(static_cast(first + rowsPerBlock), xRowCount); - if (first < last) + if (first < last) + { + BBox b; + size_t i = first; + b.upper = dx[indexes[i]]; + b.lower = dx[indexes[i]]; + PRAGMA_IVDEP + for (++i; i < last; ++i) { - BBox b; - size_t i = first; - b.upper = dx[indexes[i]]; - b.lower = dx[indexes[i]]; - PRAGMA_IVDEP - for (++i; i < last; ++i) + if (b.lower > dx[indexes[i]]) { - if (b.lower > dx[indexes[i]]) - { - b.lower = dx[indexes[i]]; - } - if (b.upper < dx[indexes[i]]) - { - b.upper = dx[indexes[i]]; - } - } - - if (bboxLocal->upper < b.upper) - { - bboxLocal->upper = b.upper; + b.lower = dx[indexes[i]]; } - if (bboxLocal->lower > b.lower) + if (b.upper < dx[indexes[i]]) { - bboxLocal->lower = b.lower; + b.upper = dx[indexes[i]]; } } + + if (bboxLocal->upper < b.upper) + { + bboxLocal->upper = b.upper; + } + if (bboxLocal->lower > b.lower) + { + bboxLocal->lower = b.lower; + } } }); + status = safeStat.detach(); + if (!status) return status; + bboxTLS.reduce([=](BBox * v) -> void { if (v) { @@ -713,8 +771,8 @@ size_t KNNClassificationTrainBatchKernel(service_malloc(idxMultiplier * (blockCount + 1) * sizeof(size_t))); - size_t * rightSegmentStartPerBlock = static_cast(service_malloc(idxMultiplier * blockCount * sizeof(size_t))); + size_t * leftSegmentStartPerBlock = static_cast(service_malloc(idxMultiplier * (blockCount + 1))); + size_t * rightSegmentStartPerBlock = static_cast(service_malloc(idxMultiplier * blockCount)); if (!leftSegmentStartPerBlock || !rightSegmentStartPerBlock) { @@ -846,7 +904,7 @@ Status KNNClassificationTrainBatchKernel(service_malloc(xRowCount * sizeof(algorithmFpType))))); + (buffer = static_cast(service_malloc(xRowCount)))); if (!awx) { status.add(services::ErrorMemoryAllocationFailed); @@ -928,10 +986,10 @@ Status KNNClassificationTrainBatchKernel(service_malloc(q.size() * sizeof(BuildNode))); + BuildNode * bnQ = static_cast(service_malloc(q.size())); DAAL_CHECK_MALLOC(bnQ) size_t posQ = 0; while (q.size() > 0) @@ -969,7 +1027,7 @@ Status KNNClassificationTrainBatchKernel(service_malloc((maxThreads + 1) * sizeof(*firstNodeIndex))); + size_t * firstNodeIndex = static_cast(service_malloc((maxThreads + 1))); DAAL_CHECK_MALLOC(firstNodeIndex) size_t nodeIndex = lastNodeIndex; for (size_t i = 0; i < maxThreads; ++i) @@ -988,7 +1046,7 @@ Status KNNClassificationTrainBatchKernelbboxes = service_scalable_calloc(ptr->bboxesCapacity * xColumnCount)) != nullptr) && ((ptr->inSortValues = service_scalable_calloc(__KDTREE_INDEX_VALUE_PAIRS_PER_THREAD)) != nullptr) && ((ptr->outSortValues = service_scalable_calloc(__KDTREE_INDEX_VALUE_PAIRS_PER_THREAD)) != nullptr) - && ((ptr->fixupQueue = static_cast(service_malloc(ptr->fixupQueueCapacity * sizeof(size_t)))) != nullptr) + && ((ptr->fixupQueue = static_cast(service_malloc(ptr->fixupQueueCapacity))) != nullptr) && ptr->buildStack.init(stackSize))) { status.add(services::ErrorMemoryAllocationFailed); @@ -1087,7 +1145,7 @@ Status KNNClassificationTrainBatchKernelfixupQueueIndex >= local->fixupQueueCapacity) { const size_t newCapacity = local->fixupQueueCapacity * 2; - size_t * const newQueue = static_cast(service_malloc(newCapacity * sizeof(size_t))); + size_t * const newQueue = static_cast(service_malloc(newCapacity)); DAAL_CHECK_THR(newQueue, services::ErrorMemoryAllocationFailed); result |= daal::services::internal::daal_memcpy_s(newQueue, newCapacity * sizeof(size_t), local->fixupQueue, local->fixupQueueIndex * sizeof(size_t)); @@ -1126,13 +1184,13 @@ Status KNNClassificationTrainBatchKernelextraKDTreeNodesCapacity > 0 ? local->extraKDTreeNodesCapacity * 2 : static_cast(1024), extraIndex + 1); KDTreeNode * const newNodes = - static_cast(service_malloc(newCapacity * sizeof(KDTreeNode))); + static_cast(service_malloc(newCapacity)); DAAL_CHECK_THR(newNodes, services::ErrorMemoryAllocationFailed); - result |= daal::services::internal::daal_memcpy_s(newNodes, newCapacity * sizeof(KDTreeNode), + result |= daal::services::internal::daal_memcpy_s(newNodes, newCapacity, local->extraKDTreeNodes, - local->extraKDTreeNodesCapacity * sizeof(KDTreeNode)); + local->extraKDTreeNodesCapacity); KDTreeNode * oldNodes = local->extraKDTreeNodes; local->extraKDTreeNodes = newNodes; local->extraKDTreeNodesCapacity = newCapacity; @@ -1144,7 +1202,7 @@ Status KNNClassificationTrainBatchKernelextraKDTreeNodesCapacity = max(extraIndex + 1, static_cast(1024)); local->extraKDTreeNodes = static_cast( - service_malloc(local->extraKDTreeNodesCapacity * sizeof(KDTreeNode))); + service_malloc(local->extraKDTreeNodesCapacity)); DAAL_CHECK_THR(local->extraKDTreeNodes, services::ErrorMemoryAllocationFailed); } @@ -1355,7 +1413,7 @@ algorithmFpType KNNClassificationTrainBatchKernel(service_malloc(sampleCount * sizeof(*samples))); + algorithmFpType * samples = static_cast(service_malloc(sampleCount)); if (!samples) { status = services::ErrorMemoryAllocationFailed; @@ -1380,7 +1438,7 @@ algorithmFpType KNNClassificationTrainBatchKernel(sampleCount, samples); - size_t * hist = static_cast(service_malloc(sampleCount * sizeof(*hist))); + size_t * hist = static_cast(service_malloc(sampleCount)); if (!hist) { status = services::ErrorMemoryAllocationFailed; @@ -1393,7 +1451,7 @@ algorithmFpType KNNClassificationTrainBatchKernel(service_malloc(subSampleCount * sizeof(*subSamples))); + algorithmFpType * subSamples = static_cast(service_malloc(subSampleCount)); if (!subSamples) { status = services::ErrorMemoryAllocationFailed; diff --git a/cpp/daal/src/externals/service_math.h b/cpp/daal/src/externals/service_math.h index bbf948bd500..7a4e6221151 100644 --- a/cpp/daal/src/externals/service_math.h +++ b/cpp/daal/src/externals/service_math.h @@ -50,18 +50,32 @@ struct Math static fpType sPowx(fpType in, fpType in1) { return _impl::sPowx(in, in1); } + static fpType xsPowx(fpType in, fpType in1) { return _impl::xsPowx(in, in1); } + static fpType sCeil(fpType in) { return _impl::sCeil(in); } + static fpType xsCeil(fpType in) { return _impl::xsCeil(in); } + static fpType sErfInv(fpType in) { return _impl::sErfInv(in); } + static fpType xsErfInv(fpType in) { return _impl::xsErfInv(in); } + static fpType sErf(fpType in) { return _impl::sErf(in); } + static fpType xsErf(fpType in) { return _impl::xsErf(in); } + static fpType sLog(fpType in) { return _impl::sLog(in); } + static fpType xsLog(fpType in) { return _impl::xsLog(in); } + static fpType sCdfNormInv(fpType in) { return _impl::sCdfNormInv(in); } + static fpType xsCdfNormInv(fpType in) { return _impl::xsCdfNormInv(in); } + static void vPowx(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl::vPowx(n, in, in1, out); } + static void xvPowx(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl::xvPowx(n, in, in1, out); } + static void vPowxAsLnExp(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl::vLog(n, in, out); @@ -72,25 +86,53 @@ struct Math _impl::vExp(n, out, out); } + static void xvPowxAsLnExp(SizeType n, const fpType * in, fpType in1, fpType * out) + { + _impl::xvLog(n, in, out); + for (size_t i = 0; i < n; i++) + { + out[i] *= in1; + } + _impl::xvExp(n, out, out); + } + static void vCeil(SizeType n, const fpType * in, fpType * out) { _impl::vCeil(n, in, out); } + static void xvCeil(SizeType n, const fpType * in, fpType * out) { _impl::xvCeil(n, in, out); } + static void vErfInv(SizeType n, const fpType * in, fpType * out) { _impl::vErfInv(n, in, out); } + static void xvErfInv(SizeType n, const fpType * in, fpType * out) { _impl::xvErfInv(n, in, out); } + static void vErf(SizeType n, const fpType * in, fpType * out) { _impl::vErf(n, in, out); } + static void xvErf(SizeType n, const fpType * in, fpType * out) { _impl::xvErf(n, in, out); } + static void vExp(SizeType n, const fpType * in, fpType * out) { _impl::vExp(n, in, out); } + static void xvExp(SizeType n, const fpType * in, fpType * out) { _impl::xvExp(n, in, out); } + static fpType vExpThreshold() { return _impl::vExpThreshold(); } static void vTanh(SizeType n, const fpType * in, fpType * out) { _impl::vTanh(n, in, out); } + static void xvTanh(SizeType n, const fpType * in, fpType * out) { _impl::xvTanh(n, in, out); } + static void vSqrt(SizeType n, const fpType * in, fpType * out) { _impl::vSqrt(n, in, out); } + static void xvSqrt(SizeType n, const fpType * in, fpType * out) { _impl::xvSqrt(n, in, out); } + static void vLog(SizeType n, const fpType * in, fpType * out) { _impl::vLog(n, in, out); } + static void xvLog(SizeType n, const fpType * in, fpType * out) { _impl::xvLog(n, in, out); } + static void vLog1p(SizeType n, const fpType * in, fpType * out) { _impl::vLog1p(n, in, out); } + static void xvLog1p(SizeType n, const fpType * in, fpType * out) { _impl::xvLog1p(n, in, out); } + static void vCdfNormInv(SizeType n, const fpType * in, fpType * out) { _impl::vCdfNormInv(n, in, out); } + + static void xvCdfNormInv(SizeType n, const fpType * in, fpType * out) { _impl::xvCdfNormInv(n, in, out); } }; } // namespace internal diff --git a/cpp/daal/src/externals/service_math_mkl.h b/cpp/daal/src/externals/service_math_mkl.h index fa5ce46a5ea..08df5d7b11d 100644 --- a/cpp/daal/src/externals/service_math_mkl.h +++ b/cpp/daal/src/externals/service_math_mkl.h @@ -66,6 +66,13 @@ struct MklMath return r; } + static double xsPowx(double in, double in1) + { + double r; + xvPowx(1, &in, in1, &r); + return r; + } + static double sCeil(double in) { double r; @@ -73,6 +80,13 @@ struct MklMath return r; } + static double xsCeil(double in) + { + double r; + xvCeil(1, &in, &r); + return r; + } + static double sErfInv(double in) { double r; @@ -80,6 +94,13 @@ struct MklMath return r; } + static double xsErfInv(double in) + { + double r; + xvErfInv(1, &in, &r); + return r; + } + static double sErf(double in) { double r; @@ -87,6 +108,13 @@ struct MklMath return r; } + static double xsErf(double in) + { + double r; + xvErf(1, &in, &r); + return r; + } + static double sLog(double in) { double r; @@ -94,6 +122,13 @@ struct MklMath return r; } + static double xsLog(double in) + { + double r; + xvLog(1, &in, &r); + return r; + } + static double sCdfNormInv(double in) { double r; @@ -101,56 +136,133 @@ struct MklMath return r; } + static double xsCdfNormInv(double in) + { + double r; + xvCdfNormInv(1, &in, &r); + return r; + } + static void vPowx(SizeType n, const double * in, double in1, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvPowx(SizeType n, const double * in, double in1, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vCeil(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvCeil(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vErfInv(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvErfInv(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vErf(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvErf(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vExp(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvExp(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static double vExpThreshold() { return -650.0; } static void vTanh(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvTanh(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vSqrt(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvSqrt(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vLog(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvLog(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vLog1p(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvLog1p(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vCdfNormInv(SizeType n, const double * in, double * out) { - __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvCdfNormInv(SizeType n, const double * in, double * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } }; @@ -178,6 +290,13 @@ struct MklMath return r; } + static float xsPowx(float in, float in1) + { + float r; + xvPowx(1, &in, in1, &r); + return r; + } + static float sCeil(float in) { float r; @@ -185,6 +304,13 @@ struct MklMath return r; } + static float xsCeil(float in) + { + float r; + xvCeil(1, &in, &r); + return r; + } + static float sErfInv(float in) { float r; @@ -192,6 +318,13 @@ struct MklMath return r; } + static float xsErfInv(float in) + { + float r; + xvErfInv(1, &in, &r); + return r; + } + static float sErf(float in) { float r; @@ -199,6 +332,13 @@ struct MklMath return r; } + static float xsErf(float in) + { + float r; + xvErf(1, &in, &r); + return r; + } + static float sLog(float in) { float r; @@ -206,6 +346,13 @@ struct MklMath return r; } + static float xsLog(float in) + { + float r; + xvLog(1, &in, &r); + return r; + } + static float sCdfNormInv(float in) { float r; @@ -213,56 +360,133 @@ struct MklMath return r; } + static float xsCdfNormInv(float in) + { + float r; + xvCdfNormInv(1, &in, &r); + return r; + } + static void vPowx(SizeType n, const float * in, float in1, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvPowx(SizeType n, const float * in, float in1, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsPowx, ((MKL_INT)n, in, in1, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vCeil(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvCeil(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsCeil, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vErfInv(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvErfInv(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vErf(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvErf(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsErf, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vExp(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvExp(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsExp, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static float vExpThreshold() { return -75.0f; } static void vTanh(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvTanh(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsTanh, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vSqrt(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvSqrt(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vLog(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvLog(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsLn, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vLog1p(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvLog1p(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } static void vCdfNormInv(SizeType n, const float * in, float * out) { - __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + } + + static void xvCdfNormInv(SizeType n, const float * in, float * out) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((MKL_INT)n, in, out, (MKL_INT)(VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE))); + mkl_set_num_threads_local(old_nthr); } }; diff --git a/cpp/daal/src/externals/service_math_ref.h b/cpp/daal/src/externals/service_math_ref.h index 07062c1ba2f..5ac4d515dd1 100644 --- a/cpp/daal/src/externals/service_math_ref.h +++ b/cpp/daal/src/externals/service_math_ref.h @@ -58,48 +58,88 @@ struct RefMath static double sPowx(double in, double in1) { return pow(in, in1); } + static double xsPowx(double in, double in1) { return pow(in, in1); } + static double sCeil(double in) { return ceil(in); } + static double xsCeil(double in) { return ceil(in); } + // Not implemented static double sErfInv(double in) { return std::numeric_limits::quiet_NaN(); } + // Not implemented + static double xsErfInv(double in) { return std::numeric_limits::quiet_NaN(); } + static double sErf(double in) { return erf(in); } + static double xsErf(double in) { return erf(in); } + static double sLog(double in) { return log(in); } + static double xsLog(double in) { return log(in); } + // Not implemented static double sCdfNormInv(double in) { return std::numeric_limits::quiet_NaN(); } + // Not implemented + static double xsCdfNormInv(double in) { return std::numeric_limits::quiet_NaN(); } + static void vPowx(SizeType n, const double * in, double in1, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1); } + static void xvPowx(SizeType n, const double * in, double in1, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1); + } + static void vCeil(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]); } + static void xvCeil(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]); + } + // Not implemented static void vErfInv(SizeType n, const double * in, double * out) { for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); } + // Not implemented + static void xvErfInv(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); + } + static void vErf(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]); } + static void xvErf(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]); + } + static void vExp(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]); } + static void xvExp(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]); + } + static double vExpThreshold() { return -650.0; @@ -111,29 +151,55 @@ struct RefMath for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]); } + static void xvTanh(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]); + } + static void vSqrt(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]); } + static void xvSqrt(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]); + } + static void vLog(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]); } + static void xvLog(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]); + } + static void vLog1p(SizeType n, const double * in, double * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]); } + static void xvLog1p(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]); + } + // Not implemented static void vCdfNormInv(SizeType n, const double * in, double * out) { for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); } + + // Not implemented + static void xvCdfNormInv(SizeType n, const double * in, double * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); + } }; /* @@ -155,48 +221,87 @@ struct RefMath static float sPowx(float in, float in1) { return pow(in, in1); } + static float xsPowx(float in, float in1) { return pow(in, in1); } + static float sCeil(float in) { return ceil(in); } + static float xsCeil(float in) { return ceil(in); } + // Not implemented static float sErfInv(float in) { return std::numeric_limits::quiet_NaN(); } + // Not implemented + static float xsErfInv(float in) { return std::numeric_limits::quiet_NaN(); } + static float sErf(float in) { return erf(in); } + static float xsErf(float in) { return erf(in); } + static float sLog(float in) { return log(in); } + static float xsLog(float in) { return log(in); } + // Not implemented static float sCdfNormInv(float in) { return std::numeric_limits::quiet_NaN(); } + static float xsCdfNormInv(float in) { return std::numeric_limits::quiet_NaN(); } + static void vPowx(SizeType n, const float * in, float in1, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1); } + static void xvPowx(SizeType n, const float * in, float in1, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = pow(in[i], in1); + } + static void vCeil(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]); } + static void xvCeil(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = ceil(in[i]); + } + // Not implemented static void vErfInv(SizeType n, const float * in, float * out) { for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); } + // Not implemented + static void xvErfInv(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); + } + static void vErf(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]); } + static void xvErf(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = erf(in[i]); + } + static void vExp(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]); } + static void xvExp(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = exp(in[i]); + } + static float vExpThreshold() { return -75.0f; @@ -208,29 +313,54 @@ struct RefMath for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]); } + static void xvTanh(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = tanh(in[i]); + } + static void vSqrt(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]); } + static void xvSqrt(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]); + } + static void vLog(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]); } + static void xvLog(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = log(in[i]); + } + static void vLog1p(SizeType n, const float * in, float * out) { #pragma omp simd for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]); } + static void xvLog1p(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = log1p(in[i]); + } + // Not implemented static void vCdfNormInv(SizeType n, const float * in, float * out) { for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); } + // Not implemented + static void xvCdfNormInv(SizeType n, const float * in, float * out) + { + for (SizeType i = 0; i < n; ++i) out[i] = std::numeric_limits::quiet_NaN(); + } }; } // namespace ref