diff --git a/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h b/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h index 4f4a45325..9992023e5 100644 --- a/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h +++ b/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h @@ -1149,15 +1149,19 @@ struct CMathEngineInfo { CMathEngineInfo( TMathEngineType type, size_t availableMemory, int id ) : Type( type ), AvailableMemory( availableMemory ), Id( id ) { Name[0] = 0; } }; +//------------------------------------------------------------------------------------------------------------ + // CMathEngine class implements an engine to perform calculations on data specified by CMemoryHandle (CFloatHandle) class NEOMATHENGINE_API IMathEngine : public IDnnEngine { public: virtual ~IMathEngine(); + // Gets the device type virtual TMathEngineType GetType() const = 0; - // Gets the device information virtual void GetMathEngineInfo( CMathEngineInfo& info ) const = 0; + // CMemoryEngineMixin has a delayed initialization after the device initialization + virtual bool IsInitialized() const = 0; // Memory management @@ -1165,6 +1169,7 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine { // In this mode, the allocated memory blocks will not be deleted on HeapFree() and may be used until CleanUp() virtual void SetReuseMemoryMode( bool enable ) = 0; virtual bool GetReuseMemoryMode() const = 0; + // Specialize the size threshold in bytes for the current thread, so // memory blocks of a size <= this threshold would be allocated in buffers if 'reuse' mode enabled // memory blocks of a size > this threshold would be allocated in raw RAM memory (malloc/free) @@ -1216,9 +1221,11 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine { // Typed data exchange template - void DataExchangeTyped( const CTypedMemoryHandle& result, const T* source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); } + void DataExchangeTyped( const CTypedMemoryHandle& result, const T* source, size_t size ) + { DataExchangeRaw( result, source, size * sizeof(T) ); } template - void DataExchangeTyped( T* result, const CTypedMemoryHandle& source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); } + void DataExchangeTyped( T* result, const CTypedMemoryHandle& source, size_t size ) + { DataExchangeRaw( result, source, size * sizeof(T) ); } // Creates a handle with data from another math engine virtual CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) = 0; @@ -1227,11 +1234,15 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine { // This object should be destroyed using the standard delete operator after use. virtual IPerformanceCounters* CreatePerformanceCounters( bool isTimeOnly = false ) const = 0; + // For Distributed only virtual CMathEngineDistributedInfo GetDistributedInfo() { return CMathEngineDistributedInfo(); } virtual void AllReduce( const CFloatHandle& handle, int size ) = 0; virtual void Broadcast( const CFloatHandle& handle, int size, int root ) = 0; virtual void AbortDistributed() {}; virtual bool IsDistributed() const { return false; } + +protected: + virtual void CleanUpSpecial() = 0; }; //------------------------------------------------------------------------------------------------------------ diff --git a/NeoMathEngine/src/CMakeLists.txt b/NeoMathEngine/src/CMakeLists.txt index 916f322e8..90a7d71a9 100644 --- a/NeoMathEngine/src/CMakeLists.txt +++ b/NeoMathEngine/src/CMakeLists.txt @@ -31,6 +31,7 @@ set(CPU_COMMON_SOURCES MathEngineDnnDropout.cpp MathEngine.cpp MathEngineHostStackAllocator.cpp + MemoryEngineMixin.cpp MemoryPool.cpp ThreadPool.cpp common.cpp @@ -50,6 +51,7 @@ target_sources(${PROJECT_NAME} MathEngineDnnLrn.h MathEngineDnnPoolings.h MathEngineHostStackAllocator.h + MemoryEngineMixin.h MemoryHandleInternal.h MemoryPool.h RawMemoryManager.h diff --git a/NeoMathEngine/src/CPU/CpuMathEngine.cpp b/NeoMathEngine/src/CPU/CpuMathEngine.cpp index d85bc38ef..8e82a4132 100644 --- a/NeoMathEngine/src/CPU/CpuMathEngine.cpp +++ b/NeoMathEngine/src/CPU/CpuMathEngine.cpp @@ -54,15 +54,12 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit, std::shared_ptr communicator, const CMathEngineDistributedInfo& distributedInfo ) : floatAlignment( FloatAlignment ), - memoryAlignment( floatAlignment * sizeof(float) ), communicator( communicator ), distributedInfo( distributedInfo ), - memoryPool( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, this, distributedInfo.Threads > 1 ) ), - stackAllocator( new CDeviceStackAllocator( *memoryPool, memoryAlignment ) ), - dllLoader( CDllLoader::AVX_DLL ), - simdMathEngine( nullptr ), - customSgemmFunction( nullptr ) + dllLoader( CDllLoader::AVX_DLL ) { + InitializeMemory( this, _memoryLimit, static_cast( floatAlignment * sizeof( float ) ), + /*reuse*/IsDistributed(), /*hostStack*/false ); #ifdef NEOML_USE_AVX if( dllLoader.IsLoaded( CDllLoader::AVX_DLL ) ) { simdMathEngine = std::unique_ptr( CDllLoader::avxDll->CreateSimdMathEngine( this ) ); @@ -73,7 +70,7 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit, } #else // !NEOML_USE_AVX // warning fix - (void)customSgemmFunction; + ( void ) customSgemmFunction; #endif // !NEOML_USE_AVX #ifdef NEOML_USE_MKL vmlSetMode( VML_ERRMODE_NOERR ); @@ -85,117 +82,8 @@ CCpuMathEngine::~CCpuMathEngine() CleanUp(); } -void CCpuMathEngine::SetReuseMemoryMode( bool enable ) +void CCpuMathEngine::CleanUpSpecial() { - // Distributed CPU math engine always uses memory pools - // because big simultaneous allocations on multiple (20+) threads are extremely slow - if( IsDistributed() ) { - return; - } - - std::lock_guard lock( mutex ); - memoryPool->SetReuseMemoryMode( enable ); -} - -bool CCpuMathEngine::GetReuseMemoryMode() const -{ - // Distributed CPU math engine always uses memory pools - if( IsDistributed() ) { - return true; - } - std::lock_guard lock( mutex ); - return memoryPool->GetReuseMemoryMode(); -} - -void CCpuMathEngine::SetThreadBufferMemoryThreshold( size_t threshold ) -{ - std::lock_guard lock( mutex ); - memoryPool->SetThreadBufferMemoryThreshold( threshold ); -} - -size_t CCpuMathEngine::GetThreadBufferMemoryThreshold() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetThreadBufferMemoryThreshold(); -} - -CMemoryHandle CCpuMathEngine::HeapAlloc( size_t size ) -{ - std::lock_guard lock( mutex ); - CMemoryHandle result = memoryPool->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CCpuMathEngine::HeapFree( const CMemoryHandle& handle ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( mutex ); - memoryPool->Free( handle ); -} - -void CCpuMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( mutex ); - memoryPool->TransferHandleToThisThread( handle, size ); -} - -CMemoryHandle CCpuMathEngine::StackAlloc( size_t size ) -{ - std::lock_guard lock( mutex ); - CMemoryHandle result = stackAllocator->Alloc(size); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CCpuMathEngine::StackFree( const CMemoryHandle& ptr ) -{ - std::lock_guard lock( mutex ); - stackAllocator->Free( ptr ); -} - -size_t CCpuMathEngine::GetFreeMemorySize() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetFreeMemorySize(); -} - -size_t CCpuMathEngine::GetPeakMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetPeakMemoryUsage(); -} - -void CCpuMathEngine::ResetPeakMemoryUsage() -{ - std::lock_guard lock( mutex ); - memoryPool->ResetPeakMemoryUsage(); -} - -size_t CCpuMathEngine::GetCurrentMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetCurrentMemoryUsage(); -} - -size_t CCpuMathEngine::GetMemoryInPools() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetMemoryInPools(); -} - -void CCpuMathEngine::CleanUp() -{ - std::lock_guard lock( mutex ); - stackAllocator->CleanUp(); - memoryPool->CleanUp(); #ifdef NEOML_USE_MKL mkl_thread_free_buffers(); #endif // NEOML_USE_MKL @@ -203,7 +91,7 @@ void CCpuMathEngine::CleanUp() void* CCpuMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t, bool exchange ) { - (void) exchange; // always returned, no need to copy + ( void ) exchange; // always returned, no need to copy return reinterpret_cast( GetRaw( handle ) ) + pos; } @@ -215,38 +103,26 @@ void CCpuMathEngine::ReleaseBuffer( const CMemoryHandle&, void*, bool ) void CCpuMathEngine::DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) { ASSERT_EXPR( handle.GetMathEngine() == this ); - ::memcpy( GetRaw( handle ), data, size ); } void CCpuMathEngine::DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) { ASSERT_EXPR( handle.GetMathEngine() == this ); - ::memcpy( data, GetRaw( handle ), size ); } -CMemoryHandle CCpuMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size ) -{ - CMemoryHandle result = HeapAlloc( size ); - - IMathEngine* otherMathEngine = handle.GetMathEngine(); - otherMathEngine->DataExchangeRaw( GetRaw( result ), handle, size ); - - return result; -} - CMemoryHandle CCpuMathEngine::Alloc( size_t size ) { // Ensure the correct alignment void* ptr = 0; - if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) { + if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) { ptr = malloc(size); } else { - char* p = static_cast(malloc(size + memoryAlignment)); + char* p = static_cast(malloc(size + MemoryAlignment)); if( p != 0 ) { - const intptr_t delta = memoryAlignment - std::abs( ( reinterpret_cast( p ) % memoryAlignment ) ); - ASSERT_EXPR( delta > 0 && delta <= static_cast( memoryAlignment ) ); + const intptr_t delta = MemoryAlignment - std::abs( ( reinterpret_cast( p ) % MemoryAlignment ) ); + ASSERT_EXPR( delta > 0 && delta <= static_cast( MemoryAlignment ) ); p[delta - 1] = static_cast( delta - 1 ); ptr = p + delta; @@ -266,7 +142,7 @@ void CCpuMathEngine::Free( const CMemoryHandle& handle ) char* ptr = GetRaw( CTypedMemoryHandle( handle ) ); - if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) { + if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) { free(ptr); return; } diff --git a/NeoMathEngine/src/CPU/CpuMathEngine.h b/NeoMathEngine/src/CPU/CpuMathEngine.h index 4eef40de5..3b402c70a 100644 --- a/NeoMathEngine/src/CPU/CpuMathEngine.h +++ b/NeoMathEngine/src/CPU/CpuMathEngine.h @@ -17,9 +17,8 @@ limitations under the License. #include #include -#include +#include #include -#include #include #include @@ -30,12 +29,10 @@ struct CCommon2DPoolingDesc; struct CCommonMaxPoolingDesc; struct CCommon3dConvolutionDesc; struct CCommonChannelwiseConvolutionDesc; -class CDeviceStackAllocator; -class CMemoryPool; class ISimdMathEngine; // Math engine that uses a CPU for calculations -class CCpuMathEngine : public IMathEngine, public IRawMemoryManager { +class CCpuMathEngine : public CMemoryEngineMixin, public IRawMemoryManager { public: CCpuMathEngine( size_t memoryLimit, std::shared_ptr communicator = nullptr, @@ -44,27 +41,12 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager { // IMathEngine interface methods TMathEngineType GetType() const override { return MET_Cpu; } - void SetReuseMemoryMode( bool enabled ) override; - bool GetReuseMemoryMode() const override; - void SetThreadBufferMemoryThreshold( size_t threshold ) override; - size_t GetThreadBufferMemoryThreshold() const override; - CMemoryHandle HeapAlloc( size_t count ) override; - void HeapFree( const CMemoryHandle& handle ) override; - void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle StackAlloc( size_t count ) override; - void StackFree( const CMemoryHandle& handle ) override; - size_t GetFreeMemorySize() const override; - size_t GetPeakMemoryUsage() const override; - void ResetPeakMemoryUsage() override; - size_t GetCurrentMemoryUsage() const override; - size_t GetMemoryInPools() const override; - void CleanUp() override; - void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; - void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; + void GetMathEngineInfo( CMathEngineInfo& info ) const override; + + void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize + void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override; void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override; - void GetMathEngineInfo( CMathEngineInfo& info ) const override; // IVectorMathEngine interface methods void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override; @@ -637,18 +619,16 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager { CMemoryHandle Alloc( size_t size ) override; void Free( const CMemoryHandle& handle ) override; + void CleanUpSpecial() override; + private: const int floatAlignment; // float alignment - const int memoryAlignment; // allocation alignment std::shared_ptr communicator; CMathEngineDistributedInfo distributedInfo; - const std::unique_ptr memoryPool; // the memory manager - const std::unique_ptr stackAllocator; // the stack memory allocator - mutable std::mutex mutex; // to protect the allocations CDllLoader dllLoader; // loading library for simd instructions std::unique_ptr simdMathEngine; // interface for using simd instructions - SgemmFunc customSgemmFunction; // Used when it is availabled and is faster then default sgemm + SgemmFunc customSgemmFunction = nullptr; // Used when it is availabled and is faster then default sgemm IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; } diff --git a/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp b/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp index 92b859707..774efea32 100644 --- a/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp +++ b/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp @@ -90,8 +90,9 @@ void CMultiThreadDistributedCommunicator::Broadcast( const CFloatHandle& handle, void CreateDistributedCpuMathEngines( IMathEngine** mathEngines, int count, size_t memoryLimit ) { auto communicator = std::make_shared( count ); - for( int i = 0; i < count; i++ ){ + for( int i = 0; i < count; ++i ) { mathEngines[i] = new CCpuMathEngine( memoryLimit, communicator, CMathEngineDistributedInfo( i, count ) ); + ASSERT_EXPR( mathEngines[i] && mathEngines[i]->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor } } diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp index c8d5538ea..6c173a307 100644 --- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp +++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp @@ -37,145 +37,6 @@ limitations under the License. namespace NeoML { -void CCudaMathEngine::CleanUp() -{ - std::lock_guard lock( mutex ); - deviceStackRunTime->CleanUp(); - hostStackRunTime->CleanUp(); - memoryPool->CleanUp(); -} - -size_t CCudaMathEngine::GetFreeMemorySize() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetFreeMemorySize(); -} - -size_t CCudaMathEngine::GetPeakMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetPeakMemoryUsage(); -} - -void CCudaMathEngine::ResetPeakMemoryUsage() -{ - std::lock_guard lock( mutex ); - memoryPool->ResetPeakMemoryUsage(); -} - -size_t CCudaMathEngine::GetCurrentMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetCurrentMemoryUsage(); -} - -size_t CCudaMathEngine::GetMemoryInPools() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetMemoryInPools(); -} - -void CCudaMathEngine::SetReuseMemoryMode( bool ) -{ - // Always true, because allocation is sync -} - -bool CCudaMathEngine::GetReuseMemoryMode() const -{ - // Always true, because allocation is sync - return true; -} - -void CCudaMathEngine::SetThreadBufferMemoryThreshold( size_t threshold ) -{ - std::lock_guard lock( mutex ); - memoryPool->SetThreadBufferMemoryThreshold( threshold ); -} - -size_t CCudaMathEngine::GetThreadBufferMemoryThreshold() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetThreadBufferMemoryThreshold(); -} - -CMemoryHandle CCudaMathEngine::HeapAlloc( size_t size ) -{ - std::lock_guard lock( mutex ); - CMemoryHandle result = memoryPool->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CCudaMathEngine::HeapFree( const CMemoryHandle& handle ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( mutex ); - memoryPool->Free( handle ); -} - -void CCudaMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( mutex ); - memoryPool->TransferHandleToThisThread( handle, size ); -} - -CMemoryHandle CCudaMathEngine::StackAlloc( size_t size ) -{ - ASSERT_EXPR( deviceStackRunTime != 0 ); - - std::lock_guard lock( mutex ); - CMemoryHandle result = deviceStackRunTime->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CCudaMathEngine::StackFree( const CMemoryHandle& ptr ) -{ - ASSERT_EXPR(ptr.GetMathEngine() == this); - - std::lock_guard lock( mutex ); - deviceStackRunTime->Free( ptr ); -} - -void* CCudaMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) -{ - ASSERT_EXPR(handle.GetMathEngine() == this); - - size_t realSize = size + 16; - char* result = reinterpret_cast( hostStackRunTime->Alloc( realSize ) ); - size_t* posPtr = reinterpret_cast( result ); - *posPtr = pos; - size_t* sizePtr = reinterpret_cast( result ) + 1; - *sizePtr = size; - if( exchange ) { - DataExchangeRaw( result + 16, handle, size ); - } - return result + 16; -} - -void CCudaMathEngine::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) -{ - ASSERT_EXPR(handle.GetMathEngine() == this); - - if( exchange ) { - size_t* posPtr = reinterpret_cast( reinterpret_cast( ptr ) - 16 ); - size_t pos = *posPtr; - size_t* sizePtr = posPtr + 1; - size_t size = *sizePtr; - - DataExchangeRaw( CTypedMemoryHandle( handle ) + pos, ptr, size ); - } - - hostStackRunTime->Free( reinterpret_cast( ptr ) - 16 ); -} - void CCudaMathEngine::DataExchangeRaw(const CMemoryHandle& handle, const void* data, size_t size) { ASSERT_EXPR(handle.GetMathEngine() == this); @@ -188,20 +49,6 @@ void CCudaMathEngine::DataExchangeRaw(void* data, const CMemoryHandle& handle, s ASSERT_CUDA(cudaMemcpy(data, GetRaw(handle), size, cudaMemcpyDeviceToHost)); } -CMemoryHandle CCudaMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size ) -{ - CMemoryHandle result = HeapAlloc( size ); - - IMathEngine* otherMathEngine = handle.GetMathEngine(); - void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true ); - - DataExchangeRaw( result, ptr, size ); - - otherMathEngine->ReleaseBuffer( handle, ptr, false ); - - return result; -} - CMemoryHandle CCudaMathEngine::Alloc( size_t size ) { SetCudaDevice( device->DeviceNumber ); @@ -237,7 +84,7 @@ void CCudaMathEngine::AllReduce( const CFloatHandle& handle, int size ) if( ncclCommunicator != nullptr ){ ncclCommunicator->AllReduce( handle, size ); } -#endif +#endif //NEOML_USE_NCCL } void CCudaMathEngine::AbortDistributed() @@ -246,7 +93,7 @@ void CCudaMathEngine::AbortDistributed() if( ncclCommunicator != nullptr ){ ncclCommunicator->Abort(); } -#endif +#endif //NEOML_USE_NCCL } void CCudaMathEngine::Broadcast( const CFloatHandle& handle, int size, int root ) @@ -258,7 +105,7 @@ void CCudaMathEngine::Broadcast( const CFloatHandle& handle, int size, int root if( ncclCommunicator != nullptr ){ ncclCommunicator->Broadcast( handle, size, root ); } -#endif +#endif //NEOML_USE_NCCL } #ifdef NEOML_USE_NCCL @@ -268,7 +115,7 @@ void CCudaMathEngine::SetDistributedCommunicator( const ncclUniqueId& uniqueId, ncclCommunicator = std::make_unique( uniqueId, info, isAbort ); distributedInfo = info; } -#endif +#endif //NEOML_USE_NCCL } // namespace NeoML diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu index c69c4f8e1..3beb99b6a 100644 --- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu +++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu @@ -40,7 +40,8 @@ const int CudaMemoryAlignment = 4; //------------------------------------------------------------------------------------------------------------ -CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cublas, std::unique_ptr& _device, int flags ) : +CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cublas, + std::unique_ptr& _device, int flags ) : loader( CDllLoader::CUDA_DLL ), cusparse( _cusparse ), cublas( _cublas ), @@ -72,16 +73,14 @@ CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cu ASSERT_CUDA( cudaGetSymbolAddress((void**)&cudaConstZero, ZeroDev) ); ASSERT_CUDA( cudaGetSymbolAddress((void**)&cudaConstOne, OneDev) ); - memoryPool = std::unique_ptr( new CMemoryPool( device->MemoryLimit, this, true ) ); - deviceStackRunTime = std::unique_ptr( new CDeviceStackAllocator( *memoryPool, CudaMemoryAlignment ) ); - hostStackRunTime = std::unique_ptr( new CHostStackAllocator( CudaMemoryAlignment ) ); + InitializeMemory( this, device->MemoryLimit, CudaMemoryAlignment, /*reuse*/true, /*hostStack*/true ); } CCudaMathEngine::~CCudaMathEngine() { - hostStackRunTime.reset(); - deviceStackRunTime.reset(); - memoryPool.reset(); + HostStackAllocator.reset(); + DeviceStackAllocator.reset(); + MemoryPool.reset(); cusparse->Destroy( cusparseHandle ); cublas->Destroy( cublasHandle ); diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h index 88cab943b..895bd0a3b 100644 --- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h +++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h @@ -21,10 +21,9 @@ limitations under the License. #include #include -#include +#include #include #include -#include #include #include #include @@ -38,12 +37,9 @@ struct CCuda3dConvolutionDescInternal; struct CCusparse; struct CCublas; struct CCudaDevice; -class CDeviceStackAllocator; -class CHostStackAllocator; -class CMemoryPool; // CUDA math engine -class CCudaMathEngine : public IMathEngine, public IRawMemoryManager { +class CCudaMathEngine : public CMemoryEngineMixin, public IRawMemoryManager { public: CCudaMathEngine( const CCusparse* cusparse, const CCublas* cublas, std::unique_ptr& device, int flags = 0 ); ~CCudaMathEngine() override; @@ -51,26 +47,9 @@ class CCudaMathEngine : public IMathEngine, public IRawMemoryManager { // IMathEngine interface methods TMathEngineType GetType() const override { return MET_Cuda; } void GetMathEngineInfo( CMathEngineInfo& info ) const override; - void SetReuseMemoryMode( bool enable ) override; - bool GetReuseMemoryMode() const override; - void SetThreadBufferMemoryThreshold( size_t threshold ) override; - size_t GetThreadBufferMemoryThreshold() const override; - CMemoryHandle HeapAlloc( size_t count ) override; - void HeapFree( const CMemoryHandle& handle ) override; - void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle StackAlloc( size_t count ) override; - void StackFree( const CMemoryHandle& handle ) override; - size_t GetFreeMemorySize() const override; - size_t GetPeakMemoryUsage() const override; - void ResetPeakMemoryUsage() override; - size_t GetCurrentMemoryUsage() const override; - size_t GetMemoryInPools() const override; - void CleanUp() override; - void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; - void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; + void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override; void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override; // IVectorMathematicsEngine interface methods void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override; @@ -656,13 +635,9 @@ class CCudaMathEngine : public IMathEngine, public IRawMemoryManager { const float* cudaConstZero; // pointer to __constant__ == 0.f const float* cudaConstOne; // pointer to __constant__ == 1.f - mutable std::mutex mutex; // protects the data below std::unique_ptr device; // the device descriptor cublasHandle_t cublasHandle; // cublas library handle cusparseHandle_t cusparseHandle; // cusparse library handle - std::unique_ptr memoryPool; // memory manager - std::unique_ptr deviceStackRunTime; // GPU memory stack allocator - std::unique_ptr hostStackRunTime; // regular memory stack allocator CMathEngineDistributedInfo distributedInfo; #ifdef NEOML_USE_NCCL std::unique_ptr ncclCommunicator = nullptr; diff --git a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h index d7af3e537..7035ac1ce 100644 --- a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h +++ b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h @@ -22,15 +22,12 @@ limitations under the License. #include #include #include -#include +#include #include namespace NeoML { class CMetalCommandQueue; -class CMemoryPool; -class CDeviceStackAllocator; -class CMutex; struct CMetalRleConvolutionDesc; // Gets the information about an available device @@ -39,34 +36,19 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info ); //------------------------------------------------------------------------------------------------------------ // The math engine using metal -class CMetalMathEngine : public IMathEngine, public IRawMemoryManager { +class CMetalMathEngine : public CMemoryEngineMixin, public IRawMemoryManager { public: explicit CMetalMathEngine( size_t memoryLimit ); ~CMetalMathEngine() override; // IMathEngine interface methods TMathEngineType GetType() const override { return MET_Metal; } - void SetReuseMemoryMode( bool enable ) override; - bool GetReuseMemoryMode() const override; - void SetThreadBufferMemoryThreshold( size_t threshold ) override; - size_t GetThreadBufferMemoryThreshold() const override; - CMemoryHandle HeapAlloc( size_t count ) override; - void HeapFree( const CMemoryHandle& handle ) override; - void TransferHandleToThisThread( const CMemoryHandle& /*handle*/, size_t /*size*/ ) override { ASSERT_EXPR( false ); } - CMemoryHandle StackAlloc( size_t count ) override; - void StackFree( const CMemoryHandle& handle ) override; - size_t GetFreeMemorySize() const override; - size_t GetPeakMemoryUsage() const override; - void ResetPeakMemoryUsage() override; - size_t GetCurrentMemoryUsage() const override; - size_t GetMemoryInPools() const override; - void CleanUp() override; - void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; - void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; + void GetMathEngineInfo( CMathEngineInfo& info ) const override; + + void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize + void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override; void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override; - void GetMathEngineInfo( CMathEngineInfo& info ) const override; // IVectorMathematicsEngine interface methods void VectorFill(const CFloatHandle& result, float value, int vectorSize) override; @@ -643,28 +625,9 @@ class CMetalMathEngine : public IMathEngine, public IRawMemoryManager { void Free( const CMemoryHandle& handle ) override; private: - // STL cannot be used here - template - class CUniquePtr : public CCrtAllocatedObject { - public: - explicit CUniquePtr( T* _ptr ) : ptr( _ptr ) {} - ~CUniquePtr() { delete ptr; } - - operator T*() { return ptr; } - operator const T*() const { return ptr; } - T* operator ->() { return ptr; } - const T* operator ->() const { return ptr; } - - private: - T* ptr; - }; - - CUniquePtr queue; // the default command queue for a metal device - CUniquePtr memoryPool; // the memory manager - CUniquePtr deviceStackAllocator; // the stack allocator of GPU memory - mutable CUniquePtr mutex; // protecting allocations + std::unique_ptr queue; // the default command queue for a metal device - IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; } + IMathEngine& mathEngine() { return *this; } void blobMergeByDim(int dimNum, const CBlobDesc* from, const CFloatHandle* fromData, int fromCount, const CBlobDesc& to, const CFloatHandle& toData); diff --git a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm index c2c803acc..6b70be7fc 100644 --- a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm +++ b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm @@ -64,121 +64,19 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info ) //---------------------------------------------------------------------------------------------------------------------------- -// Not using STL in headers -class CMutex : public std::mutex { -}; - -//---------------------------------------------------------------------------------------------------------------------------- - const int MetalMemoryAlignment = 16; CMetalMathEngine::CMetalMathEngine( size_t memoryLimit ) : - queue( new CMetalCommandQueue() ), - memoryPool( new CMemoryPool( MIN( memoryLimit == 0 ? SIZE_MAX : memoryLimit, defineMemoryLimit() ), this, false ) ), - deviceStackAllocator( new CDeviceStackAllocator( *memoryPool, MetalMemoryAlignment ) ), - mutex( new CMutex() ) + queue( new CMetalCommandQueue() ) { ASSERT_EXPR( queue->Create() ); -} - -CMetalMathEngine::~CMetalMathEngine() -{ -} + memoryLimit = MIN( memoryLimit == 0 ? SIZE_MAX : memoryLimit, defineMemoryLimit() ); -void CMetalMathEngine::SetReuseMemoryMode( bool enable ) -{ - std::lock_guard lock( *mutex ); - memoryPool->SetReuseMemoryMode( enable ); -} - -bool CMetalMathEngine::GetReuseMemoryMode() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetReuseMemoryMode(); + InitializeMemory( this, memoryLimit, MetalMemoryAlignment, /*reuse*/false, /*hostStack*/false ); } -void CMetalMathEngine::SetThreadBufferMemoryThreshold( size_t threshold ) -{ - std::lock_guard lock( *mutex ); - memoryPool->SetThreadBufferMemoryThreshold( threshold ); -} - -size_t CMetalMathEngine::GetThreadBufferMemoryThreshold() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetThreadBufferMemoryThreshold(); -} - -CMemoryHandle CMetalMathEngine::HeapAlloc( size_t size ) -{ - std::lock_guard lock( *mutex ); - CMemoryHandle result = memoryPool->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - - return result; -} - -void CMetalMathEngine::HeapFree( const CMemoryHandle& handle ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( *mutex ); - return memoryPool->Free( handle ); -} - -CMemoryHandle CMetalMathEngine::StackAlloc( size_t size ) -{ - std::lock_guard lock( *mutex ); - CMemoryHandle result = deviceStackAllocator->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CMetalMathEngine::StackFree( const CMemoryHandle& ptr ) -{ - std::lock_guard lock( *mutex ); - deviceStackAllocator->Free( ptr ); -} - -size_t CMetalMathEngine::GetFreeMemorySize() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetFreeMemorySize(); -} - -size_t CMetalMathEngine::GetPeakMemoryUsage() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetPeakMemoryUsage(); -} - -void CMetalMathEngine::ResetPeakMemoryUsage() -{ - std::lock_guard lock( *mutex ); - memoryPool->ResetPeakMemoryUsage(); -} - -size_t CMetalMathEngine::GetCurrentMemoryUsage() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetCurrentMemoryUsage(); -} - -size_t CMetalMathEngine::GetMemoryInPools() const -{ - std::lock_guard lock( *mutex ); - return memoryPool->GetMemoryInPools(); -} - -void CMetalMathEngine::CleanUp() +CMetalMathEngine::~CMetalMathEngine() { - std::lock_guard lock( *mutex ); - deviceStackAllocator->CleanUp(); - memoryPool->CleanUp(); } static void* getBufferPtr( void* buffer, ptrdiff_t offset ) @@ -218,20 +116,6 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info ) memcpy( data, buf, size ); } -CMemoryHandle CMetalMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size ) -{ - CMemoryHandle result = HeapAlloc( size ); - - IMathEngine* otherMathEngine = handle.GetMathEngine(); - void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true ); - - DataExchangeRaw( result, ptr, size ); - - otherMathEngine->ReleaseBuffer( handle, ptr, false ); - - return result; -} - void CMetalMathEngine::VectorCopy( const CFloatHandle& first, const CConstFloatHandle& second, int vectorSize ) { ASSERT_EXPR( first.GetMathEngine() == this ); diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp index c2a793397..227e422fa 100644 --- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp +++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp @@ -79,150 +79,22 @@ CVulkanMathEngine::CVulkanMathEngine( std::unique_ptr& _dev shaderLoader = std::unique_ptr( new CVulkanShaderLoader( *device ) ); commandQueue = std::unique_ptr( new CVulkanCommandQueue( *device ) ); memoryLimit = std::min( memoryLimit == 0 ? SIZE_MAX : memoryLimit, device->AvailableMemory ); - memoryPool = std::unique_ptr( new CMemoryPool( memoryLimit, this, false ) ); - deviceStackAllocator = std::unique_ptr( new CDeviceStackAllocator( *memoryPool, VulkanMemoryAlignment ) ); - hostStackAllocator = std::unique_ptr( new CHostStackAllocator( VulkanMemoryAlignment ) ); -} -CVulkanMathEngine::~CVulkanMathEngine() -{ - for( auto cur : tmpImages ) { - delete cur; - } -} - -void CVulkanMathEngine::SetReuseMemoryMode( bool enable ) -{ - std::lock_guard lock( mutex ); - memoryPool->SetReuseMemoryMode( enable ); -} - -bool CVulkanMathEngine::GetReuseMemoryMode() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetReuseMemoryMode(); -} - -void CVulkanMathEngine::SetThreadBufferMemoryThreshold( size_t threshold ) -{ - std::lock_guard lock( mutex ); - memoryPool->SetThreadBufferMemoryThreshold( threshold ); -} - -size_t CVulkanMathEngine::GetThreadBufferMemoryThreshold() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetThreadBufferMemoryThreshold(); + InitializeMemory( this, memoryLimit, VulkanMemoryAlignment, /*reuse*/false, /*hostStack*/true ); } -CMemoryHandle CVulkanMathEngine::HeapAlloc( size_t size ) -{ - std::lock_guard lock( mutex ); - CMemoryHandle result = memoryPool->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - - return result; -} - -void CVulkanMathEngine::HeapFree( const CMemoryHandle& handle ) -{ - ASSERT_EXPR( handle.GetMathEngine() == this ); - - std::lock_guard lock( mutex ); - return memoryPool->Free( handle ); -} - -CMemoryHandle CVulkanMathEngine::StackAlloc( size_t size ) -{ - std::lock_guard lock( mutex ); - CMemoryHandle result = deviceStackAllocator->Alloc( size ); - if( result.IsNull() ) { - THROW_MEMORY_EXCEPTION; - } - return result; -} - -void CVulkanMathEngine::StackFree( const CMemoryHandle& ptr ) -{ - std::lock_guard lock( mutex ); - deviceStackAllocator->Free( ptr ); -} - -size_t CVulkanMathEngine::GetFreeMemorySize() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetFreeMemorySize(); -} - -size_t CVulkanMathEngine::GetPeakMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetPeakMemoryUsage(); -} - -void CVulkanMathEngine::ResetPeakMemoryUsage() -{ - std::lock_guard lock( mutex ); - memoryPool->ResetPeakMemoryUsage(); -} - -size_t CVulkanMathEngine::GetCurrentMemoryUsage() const -{ - std::lock_guard lock( mutex ); - return memoryPool->GetCurrentMemoryUsage(); -} - -size_t CVulkanMathEngine::GetMemoryInPools() const +CVulkanMathEngine::~CVulkanMathEngine() { - std::lock_guard lock( mutex ); - return memoryPool->GetMemoryInPools(); + CleanUp(); } -void CVulkanMathEngine::CleanUp() +void CVulkanMathEngine::CleanUpSpecial() { - std::lock_guard lock( mutex ); - deviceStackAllocator->CleanUp(); - hostStackAllocator->CleanUp(); commandQueue->CleanUp(); for( auto& cur : tmpImages ) { delete cur; cur = 0; } - memoryPool->CleanUp(); -} - -void* CVulkanMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) -{ - ASSERT_EXPR(handle.GetMathEngine() == this); - - size_t realSize = size + 16; - char* result = reinterpret_cast( hostStackAllocator->Alloc( realSize ) ); - size_t* posPtr = reinterpret_cast( result ); - *posPtr = pos; - size_t* sizePtr = reinterpret_cast( result ) + 1; - *sizePtr = size; - if( exchange ) { - DataExchangeRaw( result + 16, handle, size ); - } - return result + 16; -} - -void CVulkanMathEngine::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) -{ - ASSERT_EXPR(handle.GetMathEngine() == this); - - if( exchange ) { - size_t* posPtr = reinterpret_cast( reinterpret_cast( ptr ) - 16 ); - size_t pos = *posPtr; - size_t* sizePtr = posPtr + 1; - size_t size = *sizePtr; - - DataExchangeRaw( CTypedMemoryHandle( handle ) + pos, ptr, size ); - } - - hostStackAllocator->Free( reinterpret_cast( ptr ) - 16 ); } void CVulkanMathEngine::DataExchangeRaw( const CMemoryHandle& to, const void* from, size_t size ) @@ -232,7 +104,7 @@ void CVulkanMathEngine::DataExchangeRaw( const CMemoryHandle& to, const void* fr CTypedMemoryHandle toPtr( to ); const char* fromPtr = reinterpret_cast( from ); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); while( size != 0 ) { CVulkanMemory* vulkanMemory = GetRawAllocation( toPtr ); ptrdiff_t vulkanOffset = GetRawOffset( toPtr ); @@ -284,7 +156,7 @@ void CVulkanMathEngine::DataExchangeRaw( void* to, const CMemoryHandle& from, si CTypedMemoryHandle fromPtr( from ); char* toPtr = reinterpret_cast( to ); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); while( size != 0 ) { CVulkanMemory* vulkanMemory = GetRawAllocation( fromPtr ); ptrdiff_t vulkanOffset = GetRawOffset( fromPtr ); @@ -323,20 +195,6 @@ void CVulkanMathEngine::DataExchangeRaw( void* to, const CMemoryHandle& from, si } } -CMemoryHandle CVulkanMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size ) -{ - CMemoryHandle result = HeapAlloc( size ); - - IMathEngine* otherMathEngine = handle.GetMathEngine(); - void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true ); - - DataExchangeRaw( result, ptr, size ); - - otherMathEngine->ReleaseBuffer( handle, ptr, false ); - - return result; -} - CMemoryHandle CVulkanMathEngine::Alloc( size_t size ) { VkBufferUsageFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | @@ -425,7 +283,7 @@ void CVulkanMathEngine::runShader( const CVulkanShaderData& shader, const void* const CMemoryHandle* dataBuffers, const size_t* dataSizes, int dataBufferCount, int countX, int countY, int countZ ) { - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunComputeShader( shader, Ceil(countX, shader.GroupSizeX), Ceil(countY, shader.GroupSizeY), Ceil(countZ, shader.GroupSizeZ), param, paramSize, images, imageCount, samplers, samplerCount, dataBuffers, dataSizes, dataBufferCount ); @@ -441,7 +299,7 @@ void CVulkanMathEngine::runVectorShader( const CVulkanShaderData& shader, const ASSERT_EXPR(shader.GroupSizeY == 1 && shader.GroupSizeZ == 1); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunComputeShader( shader, groupCountX, groupCountY, 1, param, paramSize, images, imageCount, samplers, samplerCount, dataBuffers, dataSizes, dataBufferCount ); } diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h index 530ca9e63..acca19dab 100644 --- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h +++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h @@ -20,13 +20,12 @@ limitations under the License. #ifdef NEOML_USE_VULKAN #include -#include #include #include #include #include #include -#include +#include #include #include @@ -39,10 +38,7 @@ struct CVulkanDevice; struct CVulkanRleConvolutionDesc; class CVulkanCommandQueue; class CVulkanShaderLoader; -class CDeviceStackAllocator; -class CHostStackAllocator; class CVulkanImage; -class CMemoryPool; // Adds the information about available vulkan devices into the result array // Returns true if at least one device has been added @@ -51,34 +47,17 @@ bool LoadVulkanEngineInfo( const CVulkanDll& dll, std::vector< CMathEngineInfo, //------------------------------------------------------------------------------------------------------------ // The math engine on vulkan -class CVulkanMathEngine : public IMathEngine, public IRawMemoryManager { +class CVulkanMathEngine : public CMemoryEngineMixin, public IRawMemoryManager { public: CVulkanMathEngine( std::unique_ptr& device, size_t memoryLimit ); ~CVulkanMathEngine() override; // IMathEngine interface methods TMathEngineType GetType() const override { return MET_Vulkan; } - void SetReuseMemoryMode( bool enable ) override; - bool GetReuseMemoryMode() const override; - void SetThreadBufferMemoryThreshold( size_t threshold ) override; - size_t GetThreadBufferMemoryThreshold() const override; - CMemoryHandle HeapAlloc( size_t count ) override; - void HeapFree( const CMemoryHandle& handle ) override; - void TransferHandleToThisThread( const CMemoryHandle& /*handle*/, size_t /*size*/ ) override { ASSERT_EXPR( false ); } - CMemoryHandle StackAlloc( size_t count ) override; - void StackFree( const CMemoryHandle& handle ) override; - size_t GetFreeMemorySize() const override; - size_t GetPeakMemoryUsage() const override; - void ResetPeakMemoryUsage() override; - size_t GetCurrentMemoryUsage() const override; - size_t GetMemoryInPools() const override; - void CleanUp() override; - void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; - void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; + void GetMathEngineInfo( CMathEngineInfo& info ) const override; + void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override; void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override; - CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override; - void GetMathEngineInfo( CMathEngineInfo& info ) const override; // IVectorMathematicsEngine interface methods void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override; @@ -654,15 +633,13 @@ class CVulkanMathEngine : public IMathEngine, public IRawMemoryManager { CMemoryHandle Alloc( size_t size ) override; void Free( const CMemoryHandle& handle ) override; + void CleanUpSpecial() override; + private: CDllLoader dllLoader; // vulkan dll wrapper - mutable std::mutex mutex; // protecting the data below from non-thread-safe use std::unique_ptr device; // device descriptor std::unique_ptr shaderLoader; // shader loader std::unique_ptr commandQueue; // shader execution queue - std::unique_ptr memoryPool; // memory manager - std::unique_ptr deviceStackAllocator; // stack allocator for GPU memory - std::unique_ptr hostStackAllocator; // stack allocator for host memory std::vector< CVulkanImage*, CrtAllocator > tmpImages; // temporary images IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; } diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp index 811defae7..a1560d69e 100644 --- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp +++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp @@ -108,7 +108,7 @@ void CVulkanMathEngine::VectorFill( const CFloatHandle& result, float value, int CVulkanMemory* vulkanMemory = GetRawAllocation( result ); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunFillBuffer( vulkanMemory->Buffer(), GetRawOffset( result ), size, data ); } @@ -119,7 +119,7 @@ void CVulkanMathEngine::VectorFill( const CIntHandle& result, int value, int vec CVulkanMemory* vulkanMemory = GetRawAllocation( result ); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunFillBuffer( vulkanMemory->Buffer(), GetRawOffset(result), size, data ); } @@ -186,7 +186,7 @@ void CVulkanMathEngine::VectorCopy( const CFloatHandle& to, const CConstFloatHan CVulkanMemory* vulkanMemoryFrom = GetRawAllocation(from); CVulkanMemory* vulkanMemoryTo = GetRawAllocation(to); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunCopyBuffer( vulkanMemoryFrom->Buffer(), vulkanMemoryTo->Buffer(), region ); } @@ -200,7 +200,7 @@ void CVulkanMathEngine::VectorCopy(const CIntHandle& to, const CConstIntHandle& CVulkanMemory* vulkanMemoryFrom = GetRawAllocation(from); CVulkanMemory* vulkanMemoryTo = GetRawAllocation(to); - std::lock_guard lock( mutex ); + std::lock_guard lock( Mutex ); commandQueue->RunCopyBuffer( vulkanMemoryFrom->Buffer(), vulkanMemoryTo->Buffer(), region ); } diff --git a/NeoMathEngine/src/MathEngine.cpp b/NeoMathEngine/src/MathEngine.cpp index 2e3f9ed55..9ce300f22 100644 --- a/NeoMathEngine/src/MathEngine.cpp +++ b/NeoMathEngine/src/MathEngine.cpp @@ -141,32 +141,38 @@ IMathEngine* CGpuMathEngineManager::CreateMathEngine( int index, size_t memoryLi if( size == 0 || index >= size ) { return nullptr; } + IMathEngine* mathEngine = nullptr; switch(info[index >= 0 ? index : 0].Type) { #ifdef NEOML_USE_CUDA case MET_Cuda: { std::unique_ptr device( CaptureCudaDevice( index >= 0 ? info[index].Id : -1, memoryLimit ) ); - if( device == nullptr ) { - return nullptr; + if( device != nullptr ) { + mathEngine = new CCudaMathEngine( CDllLoader::cusparseDll->GetFunctions(), CDllLoader::cublasDll->GetFunctions(), device, flags ); + break; } - return new CCudaMathEngine( CDllLoader::cusparseDll->GetFunctions(), CDllLoader::cublasDll->GetFunctions(), device, flags ); + return nullptr; } -#endif +#endif //NEOML_USE_CUDA #ifdef NEOML_USE_VULKAN case MET_Vulkan: { const auto& deviceInfo = loader.vulkanDll->GetDevices()[index >= 0 ? info[index].Id : 0]; std::unique_ptr device (loader.vulkanDll->CreateDevice( deviceInfo ) ); - if( !device ) { - return nullptr; + if( device != nullptr ) { + mathEngine = new CVulkanMathEngine( device, memoryLimit ); + break; } - return new CVulkanMathEngine( device, memoryLimit ); + return nullptr; } -#endif +#endif //NEOML_USE_VULKAN #ifdef NEOML_USE_METAL case MET_Metal: - return new CMetalMathEngine( memoryLimit ); -#endif + { + mathEngine = new CMetalMathEngine( memoryLimit ); + break; + } +#endif //NEOML_USE_METAL case MET_Undefined: default: { @@ -174,6 +180,8 @@ IMathEngine* CGpuMathEngineManager::CreateMathEngine( int index, size_t memoryLi return nullptr; } } + ASSERT_EXPR( mathEngine && mathEngine->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor + return mathEngine; } //------------------------------------------------------------------------------------------------------------ @@ -218,7 +226,9 @@ IMathEngineExceptionHandler* GetMathEngineExceptionHandler() IMathEngine* CreateCpuMathEngine( size_t memoryLimit ) { - return new CCpuMathEngine( memoryLimit ); + IMathEngine *mathEngine = new CCpuMathEngine( memoryLimit ); + ASSERT_EXPR( mathEngine && mathEngine->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor + return mathEngine; } // deprecated diff --git a/NeoMathEngine/src/MemoryEngineMixin.cpp b/NeoMathEngine/src/MemoryEngineMixin.cpp new file mode 100644 index 000000000..9b967980b --- /dev/null +++ b/NeoMathEngine/src/MemoryEngineMixin.cpp @@ -0,0 +1,231 @@ +/* Copyright © 2024 ABBYY + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--------------------------------------------------------------------------------------------------------------*/ + +#include +#pragma hdrstop + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace NeoML { + +void CMemoryEngineMixin::InitializeMemory( IRawMemoryManager* _rawManager, size_t _memoryLimit, int _memoryAlignment, + bool _reuse, bool _hostStack ) +{ + MemoryAlignment = _memoryAlignment; + MemoryPool.reset( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, _rawManager, _reuse ) ); + DeviceStackAllocator.reset( new CDeviceStackAllocator( *MemoryPool, MemoryAlignment ) ); + if( _hostStack == true ) { + HostStackAllocator.reset( new CHostStackAllocator( MemoryAlignment ) ); + } +} + +void CMemoryEngineMixin::SetReuseMemoryMode( bool enable ) +{ + switch( GetType() ) { + case MET_Cuda: + // Always true, because allocation is sync + break; + case MET_Cpu: + // Distributed CPU math engine always uses memory pools + // because big simultaneous allocations on multiple (20+) threads are extremely slow + if( IsDistributed() ) { + break; + } + // fallthrough + case MET_Metal: + case MET_Vulkan: + { + std::lock_guard lock( Mutex ); + MemoryPool->SetReuseMemoryMode( enable ); + break; + } + default: + ASSERT_EXPR( false ); + } +} + +bool CMemoryEngineMixin::GetReuseMemoryMode() const +{ + switch( GetType() ) { + case MET_Cuda: + // Always true, because allocation is sync + return true; + case MET_Cpu: + // Distributed CPU math engine always uses memory pools + if( IsDistributed() ) { + return true; + } + // fallthrough + case MET_Metal: + case MET_Vulkan: + { + std::lock_guard lock( Mutex ); + return MemoryPool->GetReuseMemoryMode(); + } + default: + ASSERT_EXPR( false ); + } + return false; +} + +void CMemoryEngineMixin::SetThreadBufferMemoryThreshold( size_t threshold ) +{ + std::lock_guard lock( Mutex ); + MemoryPool->SetThreadBufferMemoryThreshold( threshold ); +} + +size_t CMemoryEngineMixin::GetThreadBufferMemoryThreshold() const +{ + std::lock_guard lock( Mutex ); + return MemoryPool->GetThreadBufferMemoryThreshold(); +} + +CMemoryHandle CMemoryEngineMixin::HeapAlloc( size_t size ) +{ + std::lock_guard lock( Mutex ); + CMemoryHandle result = MemoryPool->Alloc( size ); + if( result.IsNull() ) { + THROW_MEMORY_EXCEPTION; + } + return result; +} + +void CMemoryEngineMixin::HeapFree( const CMemoryHandle& handle ) +{ + ASSERT_EXPR( handle.GetMathEngine() == this ); + + std::lock_guard lock( Mutex ); + MemoryPool->Free( handle ); +} + +void CMemoryEngineMixin::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) +{ + ASSERT_EXPR( GetType() == MET_Cpu || GetType() == MET_Cuda ); + ASSERT_EXPR( handle.GetMathEngine() == this ); + + std::lock_guard lock( Mutex ); + MemoryPool->TransferHandleToThisThread( handle, size ); +} + +CMemoryHandle CMemoryEngineMixin::StackAlloc( size_t size ) +{ + std::lock_guard lock( Mutex ); + CMemoryHandle result = DeviceStackAllocator->Alloc( size ); + if( result.IsNull() ) { + THROW_MEMORY_EXCEPTION; + } + return result; +} + +void CMemoryEngineMixin::StackFree( const CMemoryHandle& ptr ) +{ + std::lock_guard lock( Mutex ); + DeviceStackAllocator->Free( ptr ); +} + +size_t CMemoryEngineMixin::GetFreeMemorySize() const +{ + std::lock_guard lock( Mutex ); + return MemoryPool->GetFreeMemorySize(); +} + +size_t CMemoryEngineMixin::GetPeakMemoryUsage() const +{ + std::lock_guard lock( Mutex ); + return MemoryPool->GetPeakMemoryUsage(); +} + +void CMemoryEngineMixin::ResetPeakMemoryUsage() +{ + std::lock_guard lock( Mutex ); + MemoryPool->ResetPeakMemoryUsage(); +} + +size_t CMemoryEngineMixin::GetCurrentMemoryUsage() const +{ + std::lock_guard lock( Mutex ); + return MemoryPool->GetCurrentMemoryUsage(); +} + +size_t CMemoryEngineMixin::GetMemoryInPools() const +{ + std::lock_guard lock( Mutex ); + return MemoryPool->GetMemoryInPools(); +} + +void CMemoryEngineMixin::CleanUp() +{ + std::lock_guard lock( Mutex ); + DeviceStackAllocator->CleanUp(); + if( HostStackAllocator != nullptr ) { + HostStackAllocator->CleanUp(); + } + CleanUpSpecial(); + MemoryPool->CleanUp(); +} + +constexpr size_t bufferHeaderSize = 2 * sizeof( size_t ); + +void* CMemoryEngineMixin::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) +{ + ASSERT_EXPR( HostStackAllocator != nullptr ); + ASSERT_EXPR( handle.GetMathEngine() == this ); + + const size_t realSize = size + bufferHeaderSize; + char* result = static_cast( HostStackAllocator->Alloc( realSize ) ); + + size_t* header = reinterpret_cast( result ); + header[0] = pos; + header[1] = size; + + if( exchange ) { + DataExchangeRaw( result + bufferHeaderSize, handle, size ); + } + return result + bufferHeaderSize; +} + +void CMemoryEngineMixin::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) +{ + ASSERT_EXPR( HostStackAllocator != nullptr ); + ASSERT_EXPR( handle.GetMathEngine() == this ); + + if( exchange ) { + size_t* header = reinterpret_cast( static_cast( ptr ) - bufferHeaderSize ); + size_t pos = header[0]; + size_t size = header[1]; + + DataExchangeRaw( CTypedMemoryHandle( handle ) + pos, ptr, size ); + } + HostStackAllocator->Free( static_cast( ptr ) - bufferHeaderSize ); +} + +CMemoryHandle CMemoryEngineMixin::CopyFrom( const CMemoryHandle& handle, size_t size ) +{ + CMemoryHandle result = HeapAlloc( size ); + + IMathEngine* otherMathEngine = handle.GetMathEngine(); + otherMathEngine->DataExchangeRaw( GetRaw( result ), handle, size ); + + return result; +} + +} // namespace NeoML diff --git a/NeoMathEngine/src/MemoryEngineMixin.h b/NeoMathEngine/src/MemoryEngineMixin.h new file mode 100644 index 000000000..bc4091363 --- /dev/null +++ b/NeoMathEngine/src/MemoryEngineMixin.h @@ -0,0 +1,66 @@ +/* Copyright © 2024 ABBYY + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +--------------------------------------------------------------------------------------------------------------*/ + +#pragma once + +#include +#include +#include +#include + +namespace NeoML { + +class CDeviceStackAllocator; +class CHostStackAllocator; +class CMemoryPool; + +// Memory management engine base class +class CMemoryEngineMixin : public IMathEngine { +public: + void InitializeMemory( IRawMemoryManager* _rawManager, size_t _memoryLimit, int _memoryAlignment, + bool _reuse, bool _hostStack ); + bool IsInitialized() const override { return MemoryAlignment > 0 && MemoryPool != 0 && DeviceStackAllocator != 0; } + + void SetReuseMemoryMode( bool enable ) override; + bool GetReuseMemoryMode() const override; + void SetThreadBufferMemoryThreshold( size_t threshold ) override; + size_t GetThreadBufferMemoryThreshold() const override; + CMemoryHandle HeapAlloc( size_t count ) override; + void HeapFree( const CMemoryHandle& handle ) override; + void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override; + CMemoryHandle StackAlloc( size_t count ) override; + void StackFree( const CMemoryHandle& handle ) override; + size_t GetFreeMemorySize() const override; + size_t GetPeakMemoryUsage() const override; + void ResetPeakMemoryUsage() override; + size_t GetCurrentMemoryUsage() const override; + size_t GetMemoryInPools() const override; + + void CleanUp() override; + void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; + void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; + CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override; + +protected: + int MemoryAlignment = 0; // allocation alignment + mutable std::mutex Mutex; // protecting the data below from non-thread-safe use + std::unique_ptr MemoryPool; // memory manager + std::unique_ptr DeviceStackAllocator; // stack allocator for GPU memory + std::unique_ptr HostStackAllocator; // stack allocator for regular memory + + void CleanUpSpecial() override {} +}; + +} // namespace NeoML