Skip to content

Commit

Permalink
[NeoMathEngine] Refactored MemoryEngine (#1066)
Browse files Browse the repository at this point in the history
* [NeoMathEngine] Refactor MemoryEngine

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>

* [NeoMathEngine] Apply comments

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>

---------

Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>
  • Loading branch information
favorart authored Aug 20, 2024
1 parent 555b950 commit 8ec20ce
Show file tree
Hide file tree
Showing 16 changed files with 399 additions and 719 deletions.
17 changes: 14 additions & 3 deletions NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -1149,22 +1149,27 @@ struct CMathEngineInfo {
CMathEngineInfo( TMathEngineType type, size_t availableMemory, int id ) : Type( type ), AvailableMemory( availableMemory ), Id( id ) { Name[0] = 0; }
};

//------------------------------------------------------------------------------------------------------------

// CMathEngine class implements an engine to perform calculations on data specified by CMemoryHandle (CFloatHandle)
class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
public:
virtual ~IMathEngine();

// Gets the device type
virtual TMathEngineType GetType() const = 0;

// Gets the device information
virtual void GetMathEngineInfo( CMathEngineInfo& info ) const = 0;
// CMemoryEngineMixin has a delayed initialization after the device initialization
virtual bool IsInitialized() const = 0;

// Memory management

// Turns on and off the memory reuse mode
// In this mode, the allocated memory blocks will not be deleted on HeapFree() and may be used until CleanUp()
virtual void SetReuseMemoryMode( bool enable ) = 0;
virtual bool GetReuseMemoryMode() const = 0;

// Specialize the size threshold in bytes for the current thread, so
// memory blocks of a size <= this threshold would be allocated in buffers if 'reuse' mode enabled
// memory blocks of a size > this threshold would be allocated in raw RAM memory (malloc/free)
Expand Down Expand Up @@ -1216,9 +1221,11 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {

// Typed data exchange
template<class T>
void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size )
{ DataExchangeRaw( result, source, size * sizeof(T) ); }
template<class T>
void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size )
{ DataExchangeRaw( result, source, size * sizeof(T) ); }

// Creates a handle with data from another math engine
virtual CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) = 0;
Expand All @@ -1227,11 +1234,15 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
// This object should be destroyed using the standard delete operator after use.
virtual IPerformanceCounters* CreatePerformanceCounters( bool isTimeOnly = false ) const = 0;

// For Distributed only
virtual CMathEngineDistributedInfo GetDistributedInfo() { return CMathEngineDistributedInfo(); }
virtual void AllReduce( const CFloatHandle& handle, int size ) = 0;
virtual void Broadcast( const CFloatHandle& handle, int size, int root ) = 0;
virtual void AbortDistributed() {};
virtual bool IsDistributed() const { return false; }

protected:
virtual void CleanUpSpecial() = 0;
};

//------------------------------------------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions NeoMathEngine/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ set(CPU_COMMON_SOURCES
MathEngineDnnDropout.cpp
MathEngine.cpp
MathEngineHostStackAllocator.cpp
MemoryEngineMixin.cpp
MemoryPool.cpp
ThreadPool.cpp
common.cpp
Expand All @@ -50,6 +51,7 @@ target_sources(${PROJECT_NAME}
MathEngineDnnLrn.h
MathEngineDnnPoolings.h
MathEngineHostStackAllocator.h
MemoryEngineMixin.h
MemoryHandleInternal.h
MemoryPool.h
RawMemoryManager.h
Expand Down
146 changes: 11 additions & 135 deletions NeoMathEngine/src/CPU/CpuMathEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,12 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator,
const CMathEngineDistributedInfo& distributedInfo ) :
floatAlignment( FloatAlignment ),
memoryAlignment( floatAlignment * sizeof(float) ),
communicator( communicator ),
distributedInfo( distributedInfo ),
memoryPool( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, this, distributedInfo.Threads > 1 ) ),
stackAllocator( new CDeviceStackAllocator( *memoryPool, memoryAlignment ) ),
dllLoader( CDllLoader::AVX_DLL ),
simdMathEngine( nullptr ),
customSgemmFunction( nullptr )
dllLoader( CDllLoader::AVX_DLL )
{
InitializeMemory( this, _memoryLimit, static_cast<int>( floatAlignment * sizeof( float ) ),
/*reuse*/IsDistributed(), /*hostStack*/false );
#ifdef NEOML_USE_AVX
if( dllLoader.IsLoaded( CDllLoader::AVX_DLL ) ) {
simdMathEngine = std::unique_ptr<ISimdMathEngine>( CDllLoader::avxDll->CreateSimdMathEngine( this ) );
Expand All @@ -73,7 +70,7 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
}
#else // !NEOML_USE_AVX
// warning fix
(void)customSgemmFunction;
( void ) customSgemmFunction;
#endif // !NEOML_USE_AVX
#ifdef NEOML_USE_MKL
vmlSetMode( VML_ERRMODE_NOERR );
Expand All @@ -85,125 +82,16 @@ CCpuMathEngine::~CCpuMathEngine()
CleanUp();
}

void CCpuMathEngine::SetReuseMemoryMode( bool enable )
void CCpuMathEngine::CleanUpSpecial()
{
// Distributed CPU math engine always uses memory pools
// because big simultaneous allocations on multiple (20+) threads are extremely slow
if( IsDistributed() ) {
return;
}

std::lock_guard<std::mutex> lock( mutex );
memoryPool->SetReuseMemoryMode( enable );
}

// Reports whether freed blocks are cached for reuse instead of being released
bool CCpuMathEngine::GetReuseMemoryMode() const
{
	// A distributed CPU math engine always works through memory pools,
	// so for it the reuse mode is reported as permanently enabled
	if( IsDistributed() ) {
		return true;
	}
	const std::lock_guard<std::mutex> guard( mutex );
	return memoryPool->GetReuseMemoryMode();
}

// Sets the per-thread size threshold (bytes) used by the pool's buffering
void CCpuMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
{
	// All pool state is protected by the engine-wide mutex
	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->SetThreadBufferMemoryThreshold( threshold );
}

// Returns the per-thread size threshold (bytes) currently set on the pool
size_t CCpuMathEngine::GetThreadBufferMemoryThreshold() const
{
	const std::lock_guard<std::mutex> guard( mutex );
	return memoryPool->GetThreadBufferMemoryThreshold();
}

// Allocates 'size' bytes through the memory pool; throws on failure
CMemoryHandle CCpuMathEngine::HeapAlloc( size_t size )
{
	const std::lock_guard<std::mutex> guard( mutex );
	CMemoryHandle allocated = memoryPool->Alloc( size );
	if( allocated.IsNull() ) {
		THROW_MEMORY_EXCEPTION; // the pool could not satisfy the request
	}
	return allocated;
}

// Returns a heap-allocated block to the memory pool
void CCpuMathEngine::HeapFree( const CMemoryHandle& handle )
{
	// The handle must have been created by this very engine
	ASSERT_EXPR( handle.GetMathEngine() == this );

	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->Free( handle );
}

// Re-associates a block of 'size' bytes with the calling thread's pool bookkeeping
void CCpuMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size )
{
	// Only handles created by this engine may be transferred
	ASSERT_EXPR( handle.GetMathEngine() == this );

	const std::lock_guard<std::mutex> guard( mutex );
	memoryPool->TransferHandleToThisThread( handle, size );
}

// Allocates 'size' bytes from the device stack allocator; throws on failure
CMemoryHandle CCpuMathEngine::StackAlloc( size_t size )
{
	const std::lock_guard<std::mutex> guard( mutex );
	CMemoryHandle block = stackAllocator->Alloc( size );
	if( block.IsNull() ) {
		THROW_MEMORY_EXCEPTION; // stack allocation failed
	}
	return block;
}

// Returns a block obtained from StackAlloc back to the stack allocator
void CCpuMathEngine::StackFree( const CMemoryHandle& ptr )
{
	const std::lock_guard<std::mutex> guard( mutex );
	stackAllocator->Free( ptr );
}

// Returns the free memory size as reported by the memory pool
size_t CCpuMathEngine::GetFreeMemorySize() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetFreeMemorySize();
}

// Returns the peak memory usage as reported by the memory pool
size_t CCpuMathEngine::GetPeakMemoryUsage() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetPeakMemoryUsage();
}

// Resets the pool's peak-memory-usage counter
void CCpuMathEngine::ResetPeakMemoryUsage()
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	memoryPool->ResetPeakMemoryUsage();
}

// Returns the current memory usage as reported by the memory pool
size_t CCpuMathEngine::GetCurrentMemoryUsage() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetCurrentMemoryUsage();
}

// Returns the amount of memory currently held inside the pools
size_t CCpuMathEngine::GetMemoryInPools() const
{
	std::lock_guard<std::mutex> lock( mutex ); // pool access is serialized
	return memoryPool->GetMemoryInPools();
}

// Releases cached memory: cleans up the stack allocator and the memory pool;
// when built with MKL, also frees MKL's per-thread buffers
void CCpuMathEngine::CleanUp()
{
	std::lock_guard<std::mutex> lock( mutex );
	stackAllocator->CleanUp();
	memoryPool->CleanUp();
#ifdef NEOML_USE_MKL
	mkl_thread_free_buffers();
#endif // NEOML_USE_MKL
}

// Returns a host pointer into the block at byte offset 'pos'.
// On CPU the handle already refers to host memory, so the pointer is
// handed out directly and the 'exchange' flag needs no copy either way.
void* CCpuMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t, bool exchange )
{
	( void ) exchange; // always returned, no need to copy
	return reinterpret_cast<char*>( GetRaw( handle ) ) + pos;
}

Expand All @@ -215,38 +103,26 @@ void CCpuMathEngine::ReleaseBuffer( const CMemoryHandle&, void*, bool )
// Copies 'size' bytes from host memory 'data' into the block referenced by 'handle'.
// On CPU both sides are host memory, so this is a plain memcpy.
void CCpuMathEngine::DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size )
{
	ASSERT_EXPR( handle.GetMathEngine() == this ); // the handle must belong to this engine

	::memcpy( GetRaw( handle ), data, size );
}

// Copies 'size' bytes from the block referenced by 'handle' into host memory 'data'.
// On CPU both sides are host memory, so this is a plain memcpy.
void CCpuMathEngine::DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size )
{
	ASSERT_EXPR( handle.GetMathEngine() == this ); // the handle must belong to this engine

	::memcpy( data, GetRaw( handle ), size );
}

// Creates a block on this engine holding a copy of 'size' bytes from a handle
// that may belong to another math engine
CMemoryHandle CCpuMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
{
	// Allocate the destination here, then let the source engine
	// write its data straight into our host memory
	CMemoryHandle copied = HeapAlloc( size );
	handle.GetMathEngine()->DataExchangeRaw( GetRaw( copied ), handle, size );
	return copied;
}

CMemoryHandle CCpuMathEngine::Alloc( size_t size )
{
// Ensure the correct alignment
void* ptr = 0;
if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
ptr = malloc(size);
} else {
char* p = static_cast<char*>(malloc(size + memoryAlignment));
char* p = static_cast<char*>(malloc(size + MemoryAlignment));
if( p != 0 ) {
const intptr_t delta = memoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % memoryAlignment ) );
ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( memoryAlignment ) );
const intptr_t delta = MemoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % MemoryAlignment ) );
ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( MemoryAlignment ) );

p[delta - 1] = static_cast<char>( delta - 1 );
ptr = p + delta;
Expand All @@ -266,7 +142,7 @@ void CCpuMathEngine::Free( const CMemoryHandle& handle )

char* ptr = GetRaw( CTypedMemoryHandle<char>( handle ) );

if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
free(ptr);
return;
}
Expand Down
38 changes: 9 additions & 29 deletions NeoMathEngine/src/CPU/CpuMathEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ limitations under the License.

#include <NeoMathEngine/NeoMathEngine.h>
#include <NeoMathEngine/SimdMathEngine.h>
#include <RawMemoryManager.h>
#include <MemoryEngineMixin.h>
#include <DllLoader.h>
#include <mutex>
#include <memory>
#include <CpuMathEngineDnnDistributed.h>

Expand All @@ -30,12 +29,10 @@ struct CCommon2DPoolingDesc;
struct CCommonMaxPoolingDesc;
struct CCommon3dConvolutionDesc;
struct CCommonChannelwiseConvolutionDesc;
class CDeviceStackAllocator;
class CMemoryPool;
class ISimdMathEngine;

// Math engine that uses a CPU for calculations
class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
class CCpuMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
public:
CCpuMathEngine( size_t memoryLimit,
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator = nullptr,
Expand All @@ -44,27 +41,12 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {

// IMathEngine interface methods
TMathEngineType GetType() const override { return MET_Cpu; }
void SetReuseMemoryMode( bool enabled ) override;
bool GetReuseMemoryMode() const override;
void SetThreadBufferMemoryThreshold( size_t threshold ) override;
size_t GetThreadBufferMemoryThreshold() const override;
CMemoryHandle HeapAlloc( size_t count ) override;
void HeapFree( const CMemoryHandle& handle ) override;
void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override;
CMemoryHandle StackAlloc( size_t count ) override;
void StackFree( const CMemoryHandle& handle ) override;
size_t GetFreeMemorySize() const override;
size_t GetPeakMemoryUsage() const override;
void ResetPeakMemoryUsage() override;
size_t GetCurrentMemoryUsage() const override;
size_t GetMemoryInPools() const override;
void CleanUp() override;
void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
void GetMathEngineInfo( CMathEngineInfo& info ) const override;

void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize
void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize
void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
void GetMathEngineInfo( CMathEngineInfo& info ) const override;

// IVectorMathEngine interface methods
void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override;
Expand Down Expand Up @@ -637,18 +619,16 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
CMemoryHandle Alloc( size_t size ) override;
void Free( const CMemoryHandle& handle ) override;

void CleanUpSpecial() override;

private:
const int floatAlignment; // float alignment
const int memoryAlignment; // allocation alignment
std::shared_ptr<CMultiThreadDistributedCommunicator> communicator;
CMathEngineDistributedInfo distributedInfo;
const std::unique_ptr<CMemoryPool> memoryPool; // the memory manager
const std::unique_ptr<CDeviceStackAllocator> stackAllocator; // the stack memory allocator
mutable std::mutex mutex; // to protect the allocations

CDllLoader dllLoader; // loading library for simd instructions
std::unique_ptr<ISimdMathEngine> simdMathEngine; // interface for using simd instructions
SgemmFunc customSgemmFunction; // Used when it is availabled and is faster then default sgemm
SgemmFunc customSgemmFunction = nullptr; // Used when it is availabled and is faster then default sgemm

IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }

Expand Down
3 changes: 2 additions & 1 deletion NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ void CMultiThreadDistributedCommunicator::Broadcast( const CFloatHandle& handle,
// Creates 'count' CPU math engines sharing one multi-thread communicator,
// one engine per distributed worker; ownership of the engines passes to the caller
void CreateDistributedCpuMathEngines( IMathEngine** mathEngines, int count, size_t memoryLimit )
{
	auto communicator = std::make_shared<CMultiThreadDistributedCommunicator>( count );
	for( int i = 0; i < count; ++i ) {
		mathEngines[i] = new CCpuMathEngine( memoryLimit, communicator, CMathEngineDistributedInfo( i, count ) );
		// Fails if some derived class constructor did not call CMemoryEngineMixin::InitializeMemory
		ASSERT_EXPR( mathEngines[i] && mathEngines[i]->IsInitialized() );
	}
}

Expand Down
Loading

0 comments on commit 8ec20ce

Please sign in to comment.