Skip to content

Commit

Permalink
[NeoMathEngine] Refactored MemoryEngine (#1066)
Browse files Browse the repository at this point in the history
* [NeoMathEngine] Refactor MemoryEngine

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>

* [NeoMathEngine] Apply comments

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>

---------

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>
  • Loading branch information
favorart authored Aug 20, 2024
1 parent 555b950 commit 8ec20ce
Show file tree
Hide file tree
Showing 16 changed files with 399 additions and 719 deletions.
17 changes: 14 additions & 3 deletions NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -1149,22 +1149,27 @@ struct CMathEngineInfo {
CMathEngineInfo( TMathEngineType type, size_t availableMemory, int id ) : Type( type ), AvailableMemory( availableMemory ), Id( id ) { Name[0] = 0; }
};

//------------------------------------------------------------------------------------------------------------

// CMathEngine class implements an engine to perform calculations on data specified by CMemoryHandle (CFloatHandle)
class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
public:
virtual ~IMathEngine();

// Gets the device type
virtual TMathEngineType GetType() const = 0;

// Gets the device information
virtual void GetMathEngineInfo( CMathEngineInfo& info ) const = 0;
// CMemoryEngineMixin has a delayed initialization after the device initialization
virtual bool IsInitialized() const = 0;

// Memory management

// Turns on and off the memory reuse mode
// In this mode, the allocated memory blocks will not be deleted on HeapFree() and may be used until CleanUp()
virtual void SetReuseMemoryMode( bool enable ) = 0;
virtual bool GetReuseMemoryMode() const = 0;

// Specialize the size threshold in bytes for the current thread, so
// memory blocks of a size <= this threshold would be allocated in buffers if 'reuse' mode enabled
// memory blocks of a size > this threshold would be allocated in raw RAM memory (malloc/free)
Expand Down Expand Up @@ -1216,9 +1221,11 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {

// Typed data exchange
template<class T>
void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size )
{ DataExchangeRaw( result, source, size * sizeof(T) ); }
template<class T>
void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size )
{ DataExchangeRaw( result, source, size * sizeof(T) ); }

// Creates a handle with data from another math engine
virtual CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) = 0;
Expand All @@ -1227,11 +1234,15 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
// This object should be destroyed using the standard delete operator after use.
virtual IPerformanceCounters* CreatePerformanceCounters( bool isTimeOnly = false ) const = 0;

// For Distributed only
virtual CMathEngineDistributedInfo GetDistributedInfo() { return CMathEngineDistributedInfo(); }
virtual void AllReduce( const CFloatHandle& handle, int size ) = 0;
virtual void Broadcast( const CFloatHandle& handle, int size, int root ) = 0;
virtual void AbortDistributed() {};
virtual bool IsDistributed() const { return false; }

protected:
virtual void CleanUpSpecial() = 0;
};

//------------------------------------------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions NeoMathEngine/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ set(CPU_COMMON_SOURCES
MathEngineDnnDropout.cpp
MathEngine.cpp
MathEngineHostStackAllocator.cpp
MemoryEngineMixin.cpp
MemoryPool.cpp
ThreadPool.cpp
common.cpp
Expand All @@ -50,6 +51,7 @@ target_sources(${PROJECT_NAME}
MathEngineDnnLrn.h
MathEngineDnnPoolings.h
MathEngineHostStackAllocator.h
MemoryEngineMixin.h
MemoryHandleInternal.h
MemoryPool.h
RawMemoryManager.h
Expand Down
146 changes: 11 additions & 135 deletions NeoMathEngine/src/CPU/CpuMathEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,12 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator,
const CMathEngineDistributedInfo& distributedInfo ) :
floatAlignment( FloatAlignment ),
memoryAlignment( floatAlignment * sizeof(float) ),
communicator( communicator ),
distributedInfo( distributedInfo ),
memoryPool( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, this, distributedInfo.Threads > 1 ) ),
stackAllocator( new CDeviceStackAllocator( *memoryPool, memoryAlignment ) ),
dllLoader( CDllLoader::AVX_DLL ),
simdMathEngine( nullptr ),
customSgemmFunction( nullptr )
dllLoader( CDllLoader::AVX_DLL )
{
InitializeMemory( this, _memoryLimit, static_cast<int>( floatAlignment * sizeof( float ) ),
/*reuse*/IsDistributed(), /*hostStack*/false );
#ifdef NEOML_USE_AVX
if( dllLoader.IsLoaded( CDllLoader::AVX_DLL ) ) {
simdMathEngine = std::unique_ptr<ISimdMathEngine>( CDllLoader::avxDll->CreateSimdMathEngine( this ) );
Expand All @@ -73,7 +70,7 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
}
#else // !NEOML_USE_AVX
// warning fix
(void)customSgemmFunction;
( void ) customSgemmFunction;
#endif // !NEOML_USE_AVX
#ifdef NEOML_USE_MKL
vmlSetMode( VML_ERRMODE_NOERR );
Expand All @@ -85,125 +82,16 @@ CCpuMathEngine::~CCpuMathEngine()
CleanUp();
}

void CCpuMathEngine::SetReuseMemoryMode( bool enable )
void CCpuMathEngine::CleanUpSpecial()
{
// Distributed CPU math engine always uses memory pools
// because big simultaneous allocations on multiple (20+) threads are extremely slow
if( IsDistributed() ) {
return;
}

std::lock_guard<std::mutex> lock( mutex );
memoryPool->SetReuseMemoryMode( enable );
}

// Reports whether freed blocks are cached for reuse instead of being released
bool CCpuMathEngine::GetReuseMemoryMode() const
{
	// A distributed CPU math engine always works through memory pools,
	// so for it the reuse mode is reported as permanently enabled
	if( IsDistributed() ) {
		return true;
	}
	const std::lock_guard<std::mutex> guard( mutex );
	return memoryPool->GetReuseMemoryMode();
}

// Sets the per-thread size threshold (bytes) used by the pool's buffering
void CCpuMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
{
	// All pool state is protected by the engine-wide mutex
	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->SetThreadBufferMemoryThreshold( threshold );
}

// Returns the per-thread size threshold (bytes) currently set on the pool
size_t CCpuMathEngine::GetThreadBufferMemoryThreshold() const
{
	const std::lock_guard<std::mutex> guard( mutex );
	return memoryPool->GetThreadBufferMemoryThreshold();
}

// Allocates 'size' bytes through the memory pool; throws on failure
CMemoryHandle CCpuMathEngine::HeapAlloc( size_t size )
{
	const std::lock_guard<std::mutex> guard( mutex );
	CMemoryHandle allocated = memoryPool->Alloc( size );
	if( allocated.IsNull() ) {
		THROW_MEMORY_EXCEPTION; // the pool could not satisfy the request
	}
	return allocated;
}

// Returns a heap-allocated block to the memory pool
void CCpuMathEngine::HeapFree( const CMemoryHandle& handle )
{
	// The handle must have been created by this very engine
	ASSERT_EXPR( handle.GetMathEngine() == this );

	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->Free( handle );
}

// Re-associates a block of 'size' bytes with the calling thread's pool bookkeeping
void CCpuMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size )
{
	// Only handles created by this engine may be transferred
	ASSERT_EXPR( handle.GetMathEngine() == this );

	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->TransferHandleToThisThread( handle, size );
}

// Allocates 'size' bytes from the device stack allocator; throws on failure
CMemoryHandle CCpuMathEngine::StackAlloc( size_t size )
{
	const std::lock_guard<std::mutex> guard( mutex );
	CMemoryHandle block = stackAllocator->Alloc( size );
	if( block.IsNull() ) {
		THROW_MEMORY_EXCEPTION; // stack allocation failed
	}
	return block;
}

// Returns a block obtained from StackAlloc back to the stack allocator
void CCpuMathEngine::StackFree( const CMemoryHandle& ptr )
{
	const std::lock_guard<std::mutex> guard( mutex );
	stackAllocator->Free( ptr );
}

// Returns the free memory size as reported by the memory pool
size_t CCpuMathEngine::GetFreeMemorySize() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetFreeMemorySize();
}

// Returns the peak memory usage as reported by the memory pool
size_t CCpuMathEngine::GetPeakMemoryUsage() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetPeakMemoryUsage();
}

// Resets the pool's peak-memory-usage counter
void CCpuMathEngine::ResetPeakMemoryUsage()
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	memoryPool->ResetPeakMemoryUsage();
}

// Returns the current memory usage as reported by the memory pool
size_t CCpuMathEngine::GetCurrentMemoryUsage() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetCurrentMemoryUsage();
}

// Returns the amount of memory currently held inside the pools
size_t CCpuMathEngine::GetMemoryInPools() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetMemoryInPools();
}

// Releases cached memory: cleans up the stack allocator and the memory pool;
// when built with MKL, also frees MKL's per-thread buffers
void CCpuMathEngine::CleanUp()
{
	std::lock_guard<std::mutex> lock( mutex );
	stackAllocator->CleanUp();
	memoryPool->CleanUp();
#ifdef NEOML_USE_MKL
	mkl_thread_free_buffers();
#endif // NEOML_USE_MKL
}

// Returns a host pointer into the block at byte offset 'pos'.
// On CPU the handle already refers to host memory, so the pointer is
// handed out directly and the 'exchange' flag needs no copy either way.
void* CCpuMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t, bool exchange )
{
	( void ) exchange; // always returned, no need to copy
	return reinterpret_cast<char*>( GetRaw( handle ) ) + pos;
}

Expand All @@ -215,38 +103,26 @@ void CCpuMathEngine::ReleaseBuffer( const CMemoryHandle&, void*, bool )
// Copies 'size' bytes from host memory 'data' into the block referenced by 'handle'.
// On CPU both sides are host memory, so this is a plain memcpy.
void CCpuMathEngine::DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size )
{
	ASSERT_EXPR( handle.GetMathEngine() == this ); // the handle must belong to this engine

	::memcpy( GetRaw( handle ), data, size );
}

// Copies 'size' bytes from the block referenced by 'handle' into host memory 'data'.
// On CPU both sides are host memory, so this is a plain memcpy.
void CCpuMathEngine::DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size )
{
	ASSERT_EXPR( handle.GetMathEngine() == this ); // the handle must belong to this engine

	::memcpy( data, GetRaw( handle ), size );
}

// Creates a block on this engine holding a copy of 'size' bytes from a handle
// that may belong to another math engine
CMemoryHandle CCpuMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
{
	// Allocate the destination here, then let the source engine
	// write its data straight into our host memory
	CMemoryHandle copied = HeapAlloc( size );
	handle.GetMathEngine()->DataExchangeRaw( GetRaw( copied ), handle, size );
	return copied;
}

CMemoryHandle CCpuMathEngine::Alloc( size_t size )
{
// Ensure the correct alignment
void* ptr = 0;
if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
ptr = malloc(size);
} else {
char* p = static_cast<char*>(malloc(size + memoryAlignment));
char* p = static_cast<char*>(malloc(size + MemoryAlignment));
if( p != 0 ) {
const intptr_t delta = memoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % memoryAlignment ) );
ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( memoryAlignment ) );
const intptr_t delta = MemoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % MemoryAlignment ) );
ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( MemoryAlignment ) );

p[delta - 1] = static_cast<char>( delta - 1 );
ptr = p + delta;
Expand All @@ -266,7 +142,7 @@ void CCpuMathEngine::Free( const CMemoryHandle& handle )

char* ptr = GetRaw( CTypedMemoryHandle<char>( handle ) );

if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
free(ptr);
return;
}
Expand Down
38 changes: 9 additions & 29 deletions NeoMathEngine/src/CPU/CpuMathEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ limitations under the License.

#include <NeoMathEngine/NeoMathEngine.h>
#include <NeoMathEngine/SimdMathEngine.h>
#include <RawMemoryManager.h>
#include <MemoryEngineMixin.h>
#include <DllLoader.h>
#include <mutex>
#include <memory>
#include <CpuMathEngineDnnDistributed.h>

Expand All @@ -30,12 +29,10 @@ struct CCommon2DPoolingDesc;
struct CCommonMaxPoolingDesc;
struct CCommon3dConvolutionDesc;
struct CCommonChannelwiseConvolutionDesc;
class CDeviceStackAllocator;
class CMemoryPool;
class ISimdMathEngine;

// Math engine that uses a CPU for calculations
class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
class CCpuMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
public:
CCpuMathEngine( size_t memoryLimit,
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator = nullptr,
Expand All @@ -44,27 +41,12 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {

// IMathEngine interface methods
TMathEngineType GetType() const override { return MET_Cpu; }
void SetReuseMemoryMode( bool enabled ) override;
bool GetReuseMemoryMode() const override;
void SetThreadBufferMemoryThreshold( size_t threshold ) override;
size_t GetThreadBufferMemoryThreshold() const override;
CMemoryHandle HeapAlloc( size_t count ) override;
void HeapFree( const CMemoryHandle& handle ) override;
void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override;
CMemoryHandle StackAlloc( size_t count ) override;
void StackFree( const CMemoryHandle& handle ) override;
size_t GetFreeMemorySize() const override;
size_t GetPeakMemoryUsage() const override;
void ResetPeakMemoryUsage() override;
size_t GetCurrentMemoryUsage() const override;
size_t GetMemoryInPools() const override;
void CleanUp() override;
void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
void GetMathEngineInfo( CMathEngineInfo& info ) const override;

void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize
void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize
void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
void GetMathEngineInfo( CMathEngineInfo& info ) const override;

// IVectorMathEngine interface methods
void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override;
Expand Down Expand Up @@ -637,18 +619,16 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
CMemoryHandle Alloc( size_t size ) override;
void Free( const CMemoryHandle& handle ) override;

void CleanUpSpecial() override;

private:
const int floatAlignment; // float alignment
const int memoryAlignment; // allocation alignment
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator;
CMathEngineDistributedInfo distributedInfo;
const std::unique_ptr<CMemoryPool> memoryPool; // the memory manager
const std::unique_ptr<CDeviceStackAllocator> stackAllocator; // the stack memory allocator
mutable std::mutex mutex; // to protect the allocations

CDllLoader dllLoader; // loading library for simd instructions
std::unique_ptr<ISimdMathEngine> simdMathEngine; // interface for using simd instructions
SgemmFunc customSgemmFunction; // Used when it is availabled and is faster then default sgemm
SgemmFunc customSgemmFunction = nullptr; // Used when it is availabled and is faster then default sgemm

IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }

Expand Down
3 changes: 2 additions & 1 deletion NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ void CMultiThreadDistributedCommunicator::Broadcast( const CFloatHandle& handle,
// Creates 'count' CPU math engines sharing one multi-thread communicator,
// one engine per distributed worker; ownership of the engines passes to the caller
void CreateDistributedCpuMathEngines( IMathEngine** mathEngines, int count, size_t memoryLimit )
{
	auto communicator = std::make_shared<CMultiThreadDistributedCommunicator>( count );
	for( int i = 0; i < count; ++i ) {
		mathEngines[i] = new CCpuMathEngine( memoryLimit, communicator, CMathEngineDistributedInfo( i, count ) );
		// Fails if some derived class constructor did not call CMemoryEngineMixin::InitializeMemory
		ASSERT_EXPR( mathEngines[i] && mathEngines[i]->IsInitialized() );
	}
}

Expand Down
Loading

0 comments on commit 8ec20ce

Please sign in to comment.