diff --git a/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h b/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h
index 4f4a45325..9992023e5 100644
--- a/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h
+++ b/NeoMathEngine/include/NeoMathEngine/NeoMathEngine.h
@@ -1149,15 +1149,19 @@ struct CMathEngineInfo {
 	CMathEngineInfo( TMathEngineType type, size_t availableMemory, int id ) : Type( type ), AvailableMemory( availableMemory ), Id( id ) { Name[0] = 0; }
 };
 
+//------------------------------------------------------------------------------------------------------------
+
 // CMathEngine class implements an engine to perform calculations on data specified by CMemoryHandle (CFloatHandle)
 class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
 public:
 	virtual ~IMathEngine();
+
 	// Gets the device type
 	virtual TMathEngineType GetType() const = 0;
-	
 	// Gets the device information
 	virtual void GetMathEngineInfo( CMathEngineInfo& info ) const = 0;
+	// CMemoryEngineMixin has a delayed initialization after the device initialization
+	virtual bool IsInitialized() const = 0;
 
 	// Memory management
 
@@ -1165,6 +1169,7 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
 	// In this mode, the allocated memory blocks will not be deleted on HeapFree() and may be used until CleanUp()
 	virtual void SetReuseMemoryMode( bool enable ) = 0;
 	virtual bool GetReuseMemoryMode() const = 0;
+
 	// Specialize the size threshold in bytes for the current thread, so
 	// memory blocks of a size <= this threshold would be allocated in buffers if 'reuse' mode enabled
 	// memory blocks of a size >  this threshold would be allocated in raw RAM memory (malloc/free)
@@ -1216,9 +1221,11 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
 
 	// Typed data exchange
 	template<class T>
-	void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
+	void DataExchangeTyped( const CTypedMemoryHandle<T>& result, const T* source, size_t size )
+		{ DataExchangeRaw( result, source, size * sizeof(T) ); }
 	template<class T>
-	void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size ) { DataExchangeRaw( result, source, size * sizeof(T) ); }
+	void DataExchangeTyped( T* result, const CTypedMemoryHandle<const T>& source, size_t size )
+		{ DataExchangeRaw( result, source, size * sizeof(T) ); }
 
 	// Creates a handle with data from another math engine
 	virtual CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) = 0;
@@ -1227,11 +1234,15 @@ class NEOMATHENGINE_API IMathEngine : public IDnnEngine {
 	// This object should be destroyed using the standard delete operator after use.
 	virtual IPerformanceCounters* CreatePerformanceCounters( bool isTimeOnly = false ) const = 0;
 
+	// For Distributed only
 	virtual CMathEngineDistributedInfo GetDistributedInfo() { return CMathEngineDistributedInfo(); }
 	virtual void AllReduce( const CFloatHandle& handle, int size ) = 0;
 	virtual void Broadcast( const CFloatHandle& handle, int size, int root ) = 0;
 	virtual void AbortDistributed() {};
 	virtual bool IsDistributed() const { return false; }
+
+protected:
+	virtual void CleanUpSpecial() = 0;
 };
 
 //------------------------------------------------------------------------------------------------------------
diff --git a/NeoMathEngine/src/CMakeLists.txt b/NeoMathEngine/src/CMakeLists.txt
index 916f322e8..90a7d71a9 100644
--- a/NeoMathEngine/src/CMakeLists.txt
+++ b/NeoMathEngine/src/CMakeLists.txt
@@ -31,6 +31,7 @@ set(CPU_COMMON_SOURCES
     MathEngineDnnDropout.cpp
     MathEngine.cpp
     MathEngineHostStackAllocator.cpp
+    MemoryEngineMixin.cpp
     MemoryPool.cpp
     ThreadPool.cpp
     common.cpp
@@ -50,6 +51,7 @@ target_sources(${PROJECT_NAME}
     MathEngineDnnLrn.h
     MathEngineDnnPoolings.h
     MathEngineHostStackAllocator.h
+    MemoryEngineMixin.h
     MemoryHandleInternal.h
     MemoryPool.h
     RawMemoryManager.h
diff --git a/NeoMathEngine/src/CPU/CpuMathEngine.cpp b/NeoMathEngine/src/CPU/CpuMathEngine.cpp
index d85bc38ef..8e82a4132 100644
--- a/NeoMathEngine/src/CPU/CpuMathEngine.cpp
+++ b/NeoMathEngine/src/CPU/CpuMathEngine.cpp
@@ -54,15 +54,12 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
 		std::shared_ptr<CMultiThreadDistributedCommunicator> communicator,
 		const CMathEngineDistributedInfo& distributedInfo ) :
 	floatAlignment( FloatAlignment ),
-	memoryAlignment( floatAlignment * sizeof(float) ),
 	communicator( communicator ),
 	distributedInfo( distributedInfo ),
-	memoryPool( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, this, distributedInfo.Threads > 1 ) ),
-	stackAllocator( new CDeviceStackAllocator( *memoryPool, memoryAlignment ) ),
-	dllLoader( CDllLoader::AVX_DLL ),
-	simdMathEngine( nullptr ),
-	customSgemmFunction( nullptr )
+	dllLoader( CDllLoader::AVX_DLL )
 {
+	InitializeMemory( this, _memoryLimit, static_cast<int>( floatAlignment * sizeof( float ) ),
+		/*reuse*/IsDistributed(), /*hostStack*/false );
 #ifdef NEOML_USE_AVX
 	if( dllLoader.IsLoaded( CDllLoader::AVX_DLL ) ) {
 		simdMathEngine = std::unique_ptr<ISimdMathEngine>( CDllLoader::avxDll->CreateSimdMathEngine( this ) );
@@ -73,7 +70,7 @@ CCpuMathEngine::CCpuMathEngine( size_t _memoryLimit,
 	}
 #else  // !NEOML_USE_AVX
 	// warning fix
-	(void)customSgemmFunction;
+	( void ) customSgemmFunction;
 #endif // !NEOML_USE_AVX
 #ifdef NEOML_USE_MKL
 	vmlSetMode( VML_ERRMODE_NOERR );
@@ -85,117 +82,8 @@ CCpuMathEngine::~CCpuMathEngine()
 	CleanUp();
 }
 
-void CCpuMathEngine::SetReuseMemoryMode( bool enable )
+void CCpuMathEngine::CleanUpSpecial()
 {
-	// Distributed CPU math engine always uses memory pools
-	// because big simultaneous allocations on multiple (20+) threads are extremely slow
-	if( IsDistributed() ) {
-		return;
-	}
-
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->SetReuseMemoryMode( enable );
-}
-
-bool CCpuMathEngine::GetReuseMemoryMode() const
-{
-	// Distributed CPU math engine always uses memory pools
-	if( IsDistributed() ) {
-		return true;
-	}
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetReuseMemoryMode();
-}
-
-void CCpuMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->SetThreadBufferMemoryThreshold( threshold );
-}
-
-size_t CCpuMathEngine::GetThreadBufferMemoryThreshold() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetThreadBufferMemoryThreshold();
-}
-
-CMemoryHandle CCpuMathEngine::HeapAlloc( size_t size )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = memoryPool->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CCpuMathEngine::HeapFree( const CMemoryHandle& handle )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->Free( handle );
-}
-
-void CCpuMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->TransferHandleToThisThread( handle, size );
-}
-
-CMemoryHandle CCpuMathEngine::StackAlloc( size_t size )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = stackAllocator->Alloc(size);
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CCpuMathEngine::StackFree( const CMemoryHandle& ptr )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	stackAllocator->Free( ptr );
-}
-
-size_t CCpuMathEngine::GetFreeMemorySize() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetFreeMemorySize();
-}
-
-size_t CCpuMathEngine::GetPeakMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetPeakMemoryUsage();
-}
-
-void CCpuMathEngine::ResetPeakMemoryUsage()
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->ResetPeakMemoryUsage();
-}
-
-size_t CCpuMathEngine::GetCurrentMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetCurrentMemoryUsage();
-}
-
-size_t CCpuMathEngine::GetMemoryInPools() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetMemoryInPools();
-}
-
-void CCpuMathEngine::CleanUp()
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	stackAllocator->CleanUp();
-	memoryPool->CleanUp();
 #ifdef NEOML_USE_MKL
 	mkl_thread_free_buffers();
 #endif // NEOML_USE_MKL
@@ -203,7 +91,7 @@ void CCpuMathEngine::CleanUp()
 
 void* CCpuMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t, bool exchange )
 {
-	(void) exchange; // always returned, no need to copy
+	( void ) exchange; // always returned, no need to copy
 	return reinterpret_cast<char*>( GetRaw( handle ) ) + pos;
 }
 
@@ -215,38 +103,26 @@ void CCpuMathEngine::ReleaseBuffer( const CMemoryHandle&, void*, bool )
 void CCpuMathEngine::DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size )
 {
 	ASSERT_EXPR( handle.GetMathEngine() == this );
-
 	::memcpy( GetRaw( handle ), data, size );
 }
 
 void CCpuMathEngine::DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size )
 {
 	ASSERT_EXPR( handle.GetMathEngine() == this );
-
 	::memcpy( data, GetRaw( handle ), size );
 }
 
-CMemoryHandle CCpuMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
-{
-	CMemoryHandle result = HeapAlloc( size );
-
-	IMathEngine* otherMathEngine = handle.GetMathEngine();
-	otherMathEngine->DataExchangeRaw( GetRaw( result ), handle, size );
-
-	return result;
-}
-
 CMemoryHandle CCpuMathEngine::Alloc( size_t size )
 {
 	// Ensure the correct alignment
 	void* ptr = 0;
-	if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
+	if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
 		ptr = malloc(size);
 	} else {
-		char* p = static_cast<char*>(malloc(size + memoryAlignment));
+		char* p = static_cast<char*>(malloc(size + MemoryAlignment));
 		if( p != 0 ) {
-			const intptr_t delta = memoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % memoryAlignment ) );
-			ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( memoryAlignment ) );
+			const intptr_t delta = MemoryAlignment - std::abs( ( reinterpret_cast<intptr_t>( p ) % MemoryAlignment ) );
+			ASSERT_EXPR( delta > 0 && delta <= static_cast<intptr_t>( MemoryAlignment ) );
 
 			p[delta - 1] = static_cast<char>( delta - 1 );
 			ptr = p + delta;
@@ -266,7 +142,7 @@ void CCpuMathEngine::Free( const CMemoryHandle& handle )
 
 	char* ptr = GetRaw( CTypedMemoryHandle<char>( handle ) );
 
-	if( MEMORY_ALLOCATION_ALIGNMENT % memoryAlignment == 0 ) {
+	if( MEMORY_ALLOCATION_ALIGNMENT % MemoryAlignment == 0 ) {
 		free(ptr);
 		return;
 	}
diff --git a/NeoMathEngine/src/CPU/CpuMathEngine.h b/NeoMathEngine/src/CPU/CpuMathEngine.h
index 4eef40de5..3b402c70a 100644
--- a/NeoMathEngine/src/CPU/CpuMathEngine.h
+++ b/NeoMathEngine/src/CPU/CpuMathEngine.h
@@ -17,9 +17,8 @@ limitations under the License.
 
 #include <NeoMathEngine/NeoMathEngine.h>
 #include <NeoMathEngine/SimdMathEngine.h>
-#include <RawMemoryManager.h>
+#include <MemoryEngineMixin.h>
 #include <DllLoader.h>
-#include <mutex>
 #include <memory>
 #include <CpuMathEngineDnnDistributed.h>
 
@@ -30,12 +29,10 @@ struct CCommon2DPoolingDesc;
 struct CCommonMaxPoolingDesc;
 struct CCommon3dConvolutionDesc;
 struct CCommonChannelwiseConvolutionDesc;
-class CDeviceStackAllocator;
-class CMemoryPool;
 class ISimdMathEngine;
 
 // Math engine that uses a CPU for calculations
-class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
+class CCpuMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
 public:
 	CCpuMathEngine( size_t memoryLimit,
 		std::shared_ptr<CMultiThreadDistributedCommunicator> communicator = nullptr,
@@ -44,27 +41,12 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
 
 	// IMathEngine interface methods
 	TMathEngineType GetType() const override { return MET_Cpu; }
-	void SetReuseMemoryMode( bool enabled ) override;
-	bool GetReuseMemoryMode() const override;
-	void SetThreadBufferMemoryThreshold( size_t threshold ) override;
-	size_t GetThreadBufferMemoryThreshold() const override;
-	CMemoryHandle HeapAlloc( size_t count ) override;
-	void HeapFree( const CMemoryHandle& handle ) override;
-	void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle StackAlloc( size_t count ) override;
-	void StackFree( const CMemoryHandle& handle ) override;
-	size_t GetFreeMemorySize() const override;
-	size_t GetPeakMemoryUsage() const override;
-	void ResetPeakMemoryUsage() override;
-	size_t GetCurrentMemoryUsage() const override;
-	size_t GetMemoryInPools() const override;
-	void CleanUp() override;
-	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
-	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
+	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
+
+	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize
+	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize
 	void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
 	void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
-	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
 
 	// IVectorMathEngine interface methods
 	void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override;
@@ -637,18 +619,16 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
 	CMemoryHandle Alloc( size_t size ) override;
 	void Free( const CMemoryHandle& handle ) override;
 
+	void CleanUpSpecial() override;
+
 private:
 	const int floatAlignment; // float alignment
-	const int memoryAlignment; // allocation alignment
 	std::shared_ptr<CMultiThreadDistributedCommunicator> communicator;
 	CMathEngineDistributedInfo distributedInfo;
-	const std::unique_ptr<CMemoryPool> memoryPool; // the memory manager
-	const std::unique_ptr<CDeviceStackAllocator> stackAllocator; // the stack memory allocator
-	mutable std::mutex mutex; // to protect the allocations
 
 	CDllLoader dllLoader; // loading library for simd instructions
 	std::unique_ptr<ISimdMathEngine> simdMathEngine; // interface for using simd instructions
-	SgemmFunc customSgemmFunction; // Used when it is availabled and is faster then default sgemm
+	SgemmFunc customSgemmFunction = nullptr; // Used when it is availabled and is faster then default sgemm
 
 	IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }
 
diff --git a/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp b/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp
index 92b859707..774efea32 100644
--- a/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp
+++ b/NeoMathEngine/src/CPU/CpuMathEngineDnnDistributed.cpp
@@ -90,8 +90,9 @@ void CMultiThreadDistributedCommunicator::Broadcast( const CFloatHandle& handle,
 void CreateDistributedCpuMathEngines( IMathEngine** mathEngines, int count, size_t memoryLimit )
 {
 	auto communicator = std::make_shared<CMultiThreadDistributedCommunicator>( count );
-	for( int i = 0; i < count; i++ ){
+	for( int i = 0; i < count; ++i ) {
 		mathEngines[i] = new CCpuMathEngine( memoryLimit, communicator, CMathEngineDistributedInfo( i, count ) );
+		ASSERT_EXPR( mathEngines[i] && mathEngines[i]->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor
 	}
 }
 
diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp
index c8d5538ea..6c173a307 100644
--- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp
+++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cpp
@@ -37,145 +37,6 @@ limitations under the License.
 
 namespace NeoML {
 
-void CCudaMathEngine::CleanUp()
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	deviceStackRunTime->CleanUp();
-	hostStackRunTime->CleanUp();
-	memoryPool->CleanUp();
-}
-
-size_t CCudaMathEngine::GetFreeMemorySize() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetFreeMemorySize();
-}
-
-size_t CCudaMathEngine::GetPeakMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetPeakMemoryUsage();
-}
-
-void CCudaMathEngine::ResetPeakMemoryUsage()
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->ResetPeakMemoryUsage();
-}
-
-size_t CCudaMathEngine::GetCurrentMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetCurrentMemoryUsage();
-}
-
-size_t CCudaMathEngine::GetMemoryInPools() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetMemoryInPools();
-}
-
-void CCudaMathEngine::SetReuseMemoryMode( bool )
-{
-	// Always true, because allocation is sync
-}
-
-bool CCudaMathEngine::GetReuseMemoryMode() const
-{
-	// Always true, because allocation is sync
-	return true;
-}
-
-void CCudaMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->SetThreadBufferMemoryThreshold( threshold );
-}
-
-size_t CCudaMathEngine::GetThreadBufferMemoryThreshold() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetThreadBufferMemoryThreshold();
-}
-
-CMemoryHandle CCudaMathEngine::HeapAlloc( size_t size )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = memoryPool->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CCudaMathEngine::HeapFree( const CMemoryHandle& handle )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->Free( handle );
-}
-
-void CCudaMathEngine::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->TransferHandleToThisThread( handle, size );
-}
-
-CMemoryHandle CCudaMathEngine::StackAlloc( size_t size )
-{
-	ASSERT_EXPR( deviceStackRunTime != 0 );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = deviceStackRunTime->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CCudaMathEngine::StackFree( const CMemoryHandle& ptr )
-{
-	ASSERT_EXPR(ptr.GetMathEngine() == this);
-
-	std::lock_guard<std::mutex> lock( mutex );
-	deviceStackRunTime->Free( ptr );
-}
-
-void* CCudaMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange )
-{
-	ASSERT_EXPR(handle.GetMathEngine() == this);
-
-	size_t realSize = size + 16;
-	char* result = reinterpret_cast<char*>( hostStackRunTime->Alloc( realSize ) );
-	size_t* posPtr = reinterpret_cast<size_t*>( result );
-	*posPtr = pos;
-	size_t* sizePtr = reinterpret_cast<size_t*>( result ) + 1;
-	*sizePtr = size;
-	if( exchange ) {
-		DataExchangeRaw( result + 16, handle, size );
-	}
-	return result + 16;
-}
-
-void CCudaMathEngine::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange )
-{
-	ASSERT_EXPR(handle.GetMathEngine() == this);
-
-	if( exchange ) {
-		size_t* posPtr = reinterpret_cast<size_t*>( reinterpret_cast<char*>( ptr ) - 16 );
-		size_t pos = *posPtr;
-		size_t* sizePtr = posPtr + 1;
-		size_t size = *sizePtr;
-
-		DataExchangeRaw( CTypedMemoryHandle<char>( handle ) + pos, ptr, size );
-	}
-
-	hostStackRunTime->Free( reinterpret_cast<char*>( ptr ) - 16 );
-}
-
 void CCudaMathEngine::DataExchangeRaw(const CMemoryHandle& handle, const void* data, size_t size)
 {
 	ASSERT_EXPR(handle.GetMathEngine() == this);
@@ -188,20 +49,6 @@ void CCudaMathEngine::DataExchangeRaw(void* data, const CMemoryHandle& handle, s
 	ASSERT_CUDA(cudaMemcpy(data, GetRaw(handle), size, cudaMemcpyDeviceToHost));
 }
 
-CMemoryHandle CCudaMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
-{
-	CMemoryHandle result = HeapAlloc( size );
-
-	IMathEngine* otherMathEngine = handle.GetMathEngine();
-	void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true );
-
-	DataExchangeRaw( result, ptr, size );
-
-	otherMathEngine->ReleaseBuffer( handle, ptr, false );
-
-	return result;
-}
-
 CMemoryHandle CCudaMathEngine::Alloc( size_t size )
 {
 	SetCudaDevice( device->DeviceNumber );
@@ -237,7 +84,7 @@ void CCudaMathEngine::AllReduce( const CFloatHandle& handle, int size )
 	if( ncclCommunicator != nullptr ){
 		ncclCommunicator->AllReduce( handle, size );
 	}
-#endif
+#endif //NEOML_USE_NCCL
 }
 
 void CCudaMathEngine::AbortDistributed()
@@ -246,7 +93,7 @@ void CCudaMathEngine::AbortDistributed()
 	if( ncclCommunicator != nullptr ){
 		ncclCommunicator->Abort();
 	}
-#endif
+#endif //NEOML_USE_NCCL
 }
 
 void CCudaMathEngine::Broadcast( const CFloatHandle& handle, int size, int root )
@@ -258,7 +105,7 @@ void CCudaMathEngine::Broadcast( const CFloatHandle& handle, int size, int root
 	if( ncclCommunicator != nullptr ){
 		ncclCommunicator->Broadcast( handle, size, root );
 	}
-#endif
+#endif //NEOML_USE_NCCL
 }
 
 #ifdef NEOML_USE_NCCL
@@ -268,7 +115,7 @@ void CCudaMathEngine::SetDistributedCommunicator( const ncclUniqueId& uniqueId,
 	ncclCommunicator = std::make_unique<CCudaDistributedCommunicator>( uniqueId, info, isAbort );
 	distributedInfo = info;
 }
-#endif
+#endif //NEOML_USE_NCCL
 
 } // namespace NeoML
 
diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu
index c69c4f8e1..3beb99b6a 100644
--- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu
+++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.cu
@@ -40,7 +40,8 @@ const int CudaMemoryAlignment = 4;
 
 //------------------------------------------------------------------------------------------------------------
 
-CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cublas, std::unique_ptr<CCudaDevice>& _device, int flags ) :
+CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cublas,
+		std::unique_ptr<CCudaDevice>& _device, int flags ) :
 	loader( CDllLoader::CUDA_DLL ),
 	cusparse( _cusparse ),
 	cublas( _cublas ),
@@ -72,16 +73,14 @@ CCudaMathEngine::CCudaMathEngine( const CCusparse* _cusparse, const CCublas* _cu
 	ASSERT_CUDA( cudaGetSymbolAddress((void**)&cudaConstZero, ZeroDev) );
 	ASSERT_CUDA( cudaGetSymbolAddress((void**)&cudaConstOne, OneDev) );
 
-	memoryPool = std::unique_ptr<CMemoryPool>( new CMemoryPool( device->MemoryLimit, this, true ) );
-	deviceStackRunTime = std::unique_ptr<CDeviceStackAllocator>( new CDeviceStackAllocator( *memoryPool, CudaMemoryAlignment ) );
-	hostStackRunTime = std::unique_ptr<CHostStackAllocator>( new CHostStackAllocator( CudaMemoryAlignment ) );
+	InitializeMemory( this, device->MemoryLimit, CudaMemoryAlignment, /*reuse*/true, /*hostStack*/true );
 }
 
 CCudaMathEngine::~CCudaMathEngine()
 {
-	hostStackRunTime.reset();
-	deviceStackRunTime.reset();
-	memoryPool.reset();
+	HostStackAllocator.reset();
+	DeviceStackAllocator.reset();
+	MemoryPool.reset();
 
 	cusparse->Destroy( cusparseHandle );
 	cublas->Destroy( cublasHandle );
diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h
index 88cab943b..895bd0a3b 100644
--- a/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h
+++ b/NeoMathEngine/src/GPU/CUDA/CudaMathEngine.h
@@ -21,10 +21,9 @@ limitations under the License.
 
 #include <NeoMathEngine/NeoMathEngine.h>
 #include <DllLoader.h>
-#include <RawMemoryManager.h>
+#include <MemoryEngineMixin.h>
 #include <cusparse.h>
 #include <cublas.h>
-#include <mutex>
 #include <memory>
 #include <PerformanceCountersDefault.h>
 #include <CudaMathEngineDnnDistributed.h>
@@ -38,12 +37,9 @@ struct CCuda3dConvolutionDescInternal;
 struct CCusparse;
 struct CCublas;
 struct CCudaDevice;
-class CDeviceStackAllocator;
-class CHostStackAllocator;
-class CMemoryPool;
 
 // CUDA math engine
-class CCudaMathEngine : public IMathEngine, public IRawMemoryManager {
+class CCudaMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
 public:
 	CCudaMathEngine( const CCusparse* cusparse, const CCublas* cublas, std::unique_ptr<CCudaDevice>& device, int flags = 0 );
 	~CCudaMathEngine() override;
@@ -51,26 +47,9 @@ class CCudaMathEngine : public IMathEngine, public IRawMemoryManager {
 	// IMathEngine interface methods
 	TMathEngineType GetType() const override { return MET_Cuda; }
 	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
-	void SetReuseMemoryMode( bool enable ) override;
-	bool GetReuseMemoryMode() const override;
-	void SetThreadBufferMemoryThreshold( size_t threshold ) override;
-	size_t GetThreadBufferMemoryThreshold() const override;
-	CMemoryHandle HeapAlloc( size_t count ) override;
-	void HeapFree( const CMemoryHandle& handle ) override;
-	void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle StackAlloc( size_t count ) override;
-	void StackFree( const CMemoryHandle& handle ) override;
-	size_t GetFreeMemorySize() const override;
-	size_t GetPeakMemoryUsage() const override;
-	void ResetPeakMemoryUsage() override;
-	size_t GetCurrentMemoryUsage() const override;
-	size_t GetMemoryInPools() const override;
-	void CleanUp() override;
-	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
-	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
+
 	void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
 	void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
 
 	// IVectorMathematicsEngine interface methods
 	void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override;
@@ -656,13 +635,9 @@ class CCudaMathEngine : public IMathEngine, public IRawMemoryManager {
 	const float* cudaConstZero; // pointer to __constant__ == 0.f
 	const float* cudaConstOne; // pointer to __constant__ == 1.f
 
-	mutable std::mutex mutex; // protects the data below
 	std::unique_ptr<CCudaDevice> device; // the device descriptor
 	cublasHandle_t cublasHandle; // cublas library handle
 	cusparseHandle_t cusparseHandle; // cusparse library handle
-	std::unique_ptr<CMemoryPool> memoryPool; // memory manager
-	std::unique_ptr<CDeviceStackAllocator> deviceStackRunTime; // GPU memory stack allocator
-	std::unique_ptr<CHostStackAllocator> hostStackRunTime; // regular memory stack allocator
 	CMathEngineDistributedInfo distributedInfo;
 #ifdef NEOML_USE_NCCL
 	std::unique_ptr<CCudaDistributedCommunicator> ncclCommunicator = nullptr;
diff --git a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h
index d7af3e537..7035ac1ce 100644
--- a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h
+++ b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.h
@@ -22,15 +22,12 @@ limitations under the License.
 #include <NeoMathEngine/CrtAllocatedObject.h>
 #include <NeoMathEngine/NeoMathEngine.h>
 #include <MathEngineCommon.h>
-#include <RawMemoryManager.h>
+#include <MemoryEngineMixin.h>
 #include <PerformanceCountersDefault.h>
 
 namespace NeoML {
 
 class CMetalCommandQueue;
-class CMemoryPool;
-class CDeviceStackAllocator;
-class CMutex;
 struct CMetalRleConvolutionDesc;
 
 // Gets the information about an available device
@@ -39,34 +36,19 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info );
 //------------------------------------------------------------------------------------------------------------
 
 // The math engine using metal
-class CMetalMathEngine : public IMathEngine, public IRawMemoryManager {
+class CMetalMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
 public:
 	explicit CMetalMathEngine( size_t memoryLimit );
 	~CMetalMathEngine() override;
 
 	// IMathEngine interface methods
 	TMathEngineType GetType() const override { return MET_Metal; }
-	void SetReuseMemoryMode( bool enable ) override;
-	bool GetReuseMemoryMode() const override;
-	void SetThreadBufferMemoryThreshold( size_t threshold ) override;
-	size_t GetThreadBufferMemoryThreshold() const override;
-	CMemoryHandle HeapAlloc( size_t count ) override;
-	void HeapFree( const CMemoryHandle& handle ) override;
-	void TransferHandleToThisThread( const CMemoryHandle& /*handle*/, size_t /*size*/ ) override { ASSERT_EXPR( false ); }
-	CMemoryHandle StackAlloc( size_t count ) override;
-	void StackFree( const CMemoryHandle& handle ) override;
-	size_t GetFreeMemorySize() const override;
-	size_t GetPeakMemoryUsage() const override;
-	void ResetPeakMemoryUsage() override;
-	size_t GetCurrentMemoryUsage() const override;
-	size_t GetMemoryInPools() const override;
-	void CleanUp() override;
-	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
-	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
+	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
+
+	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override; // specialize
+	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override; // specialize
 	void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
 	void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
-	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
 
 	// IVectorMathematicsEngine interface methods
 	void VectorFill(const CFloatHandle& result, float value, int vectorSize) override;
@@ -643,28 +625,9 @@ class CMetalMathEngine : public IMathEngine, public IRawMemoryManager {
 	void Free( const CMemoryHandle& handle ) override;
 
 private:
-	// STL cannot be used here
-	template<class T>
-	class CUniquePtr : public CCrtAllocatedObject {
-	public:
-		explicit CUniquePtr( T* _ptr ) : ptr( _ptr ) {}
-		~CUniquePtr() { delete ptr; }
-
-		operator T*() { return ptr; }
-		operator const T*() const { return ptr; }
-		T* operator ->() { return ptr; }
-		const T* operator ->() const { return ptr; }
-
-	private:
-		T* ptr;
-	};
-
-	CUniquePtr<CMetalCommandQueue> queue; // the default command queue for a metal device
-	CUniquePtr<CMemoryPool> memoryPool; // the memory manager
-	CUniquePtr<CDeviceStackAllocator> deviceStackAllocator; // the stack allocator of GPU memory
-	mutable CUniquePtr<CMutex> mutex; // protecting allocations
+	std::unique_ptr<CMetalCommandQueue> queue; // the default command queue for a metal device
 
-	IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }
+	IMathEngine& mathEngine() { return *this; }
 
 	void blobMergeByDim(int dimNum, const CBlobDesc* from, const CFloatHandle* fromData, int fromCount,
 		const CBlobDesc& to, const CFloatHandle& toData);
diff --git a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm
index c2c803acc..6b70be7fc 100644
--- a/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm
+++ b/NeoMathEngine/src/GPU/Metal/MetalMathEngine.mm
@@ -64,121 +64,19 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info )
 
 //----------------------------------------------------------------------------------------------------------------------------
 
-// Not using STL in headers
-class CMutex : public std::mutex {
-};
-
-//----------------------------------------------------------------------------------------------------------------------------
-
 const int MetalMemoryAlignment = 16;
 
 CMetalMathEngine::CMetalMathEngine( size_t memoryLimit ) :
-	queue( new CMetalCommandQueue() ),
-	memoryPool( new CMemoryPool( MIN( memoryLimit == 0 ? SIZE_MAX : memoryLimit, defineMemoryLimit() ), this, false ) ),
-	deviceStackAllocator( new CDeviceStackAllocator( *memoryPool, MetalMemoryAlignment ) ),
-	mutex( new CMutex() )
+	queue( new CMetalCommandQueue() )
 {
 	ASSERT_EXPR( queue->Create() );
-}
-
-CMetalMathEngine::~CMetalMathEngine()
-{
-}
+	memoryLimit = MIN( memoryLimit == 0 ? SIZE_MAX : memoryLimit, defineMemoryLimit() );
 
-void CMetalMathEngine::SetReuseMemoryMode( bool enable )
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	memoryPool->SetReuseMemoryMode( enable );
-}
-
-bool CMetalMathEngine::GetReuseMemoryMode() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetReuseMemoryMode();
+	InitializeMemory( this, memoryLimit, MetalMemoryAlignment, /*reuse*/false, /*hostStack*/false );
 }
 
-void CMetalMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	memoryPool->SetThreadBufferMemoryThreshold( threshold );
-}
-
-size_t CMetalMathEngine::GetThreadBufferMemoryThreshold() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetThreadBufferMemoryThreshold();
-}
-
-CMemoryHandle CMetalMathEngine::HeapAlloc( size_t size )
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	CMemoryHandle result = memoryPool->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-
-	return result;
-}
-
-void CMetalMathEngine::HeapFree( const CMemoryHandle& handle )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->Free( handle );
-}
-
-CMemoryHandle CMetalMathEngine::StackAlloc( size_t size )
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	CMemoryHandle result = deviceStackAllocator->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CMetalMathEngine::StackFree( const CMemoryHandle& ptr )
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	deviceStackAllocator->Free( ptr );
-}
-
-size_t CMetalMathEngine::GetFreeMemorySize() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetFreeMemorySize();
-}
-
-size_t CMetalMathEngine::GetPeakMemoryUsage() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetPeakMemoryUsage();
-}
-
-void CMetalMathEngine::ResetPeakMemoryUsage()
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	memoryPool->ResetPeakMemoryUsage();
-}
-
-size_t CMetalMathEngine::GetCurrentMemoryUsage() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetCurrentMemoryUsage();
-}
-
-size_t CMetalMathEngine::GetMemoryInPools() const
-{
-	std::lock_guard<CMutex> lock( *mutex );
-	return memoryPool->GetMemoryInPools();
-}
-
-void CMetalMathEngine::CleanUp()
+CMetalMathEngine::~CMetalMathEngine()
 {
-	std::lock_guard<CMutex> lock( *mutex );
-	deviceStackAllocator->CleanUp();
-	memoryPool->CleanUp();
 }
 
 static void* getBufferPtr( void* buffer, ptrdiff_t offset )
@@ -218,20 +116,6 @@ bool LoadMetalEngineInfo( CMathEngineInfo& info )
 	memcpy( data, buf, size );
 }
 
-CMemoryHandle CMetalMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
-{
-	CMemoryHandle result = HeapAlloc( size );
-
-	IMathEngine* otherMathEngine = handle.GetMathEngine();
-	void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true );
-
-	DataExchangeRaw( result, ptr, size );
-
-	otherMathEngine->ReleaseBuffer( handle, ptr, false );
-
-	return result;
-}
-
 void CMetalMathEngine::VectorCopy( const CFloatHandle& first, const CConstFloatHandle& second, int vectorSize )
 {
 	ASSERT_EXPR( first.GetMathEngine() == this );
diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp
index c2a793397..227e422fa 100644
--- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp
+++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.cpp
@@ -79,150 +79,22 @@ CVulkanMathEngine::CVulkanMathEngine( std::unique_ptr<const CVulkanDevice>& _dev
 	shaderLoader = std::unique_ptr<CVulkanShaderLoader>( new CVulkanShaderLoader( *device ) );
 	commandQueue = std::unique_ptr<CVulkanCommandQueue>( new CVulkanCommandQueue( *device ) );
 	memoryLimit = std::min<size_t>( memoryLimit == 0 ? SIZE_MAX : memoryLimit, device->AvailableMemory );
-	memoryPool = std::unique_ptr<CMemoryPool>( new CMemoryPool( memoryLimit, this, false ) );
-	deviceStackAllocator = std::unique_ptr<CDeviceStackAllocator>( new CDeviceStackAllocator( *memoryPool, VulkanMemoryAlignment ) );
-	hostStackAllocator = std::unique_ptr<CHostStackAllocator>( new CHostStackAllocator( VulkanMemoryAlignment ) );
-}
 
-CVulkanMathEngine::~CVulkanMathEngine()
-{
-	for( auto cur : tmpImages ) {
-		delete cur;
-	}
-}
-
-void CVulkanMathEngine::SetReuseMemoryMode( bool enable )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->SetReuseMemoryMode( enable );
-}
-
-bool CVulkanMathEngine::GetReuseMemoryMode() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetReuseMemoryMode();
-}
-
-void CVulkanMathEngine::SetThreadBufferMemoryThreshold( size_t threshold )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->SetThreadBufferMemoryThreshold( threshold );
-}
-
-size_t CVulkanMathEngine::GetThreadBufferMemoryThreshold() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetThreadBufferMemoryThreshold();
+	InitializeMemory( this, memoryLimit, VulkanMemoryAlignment, /*reuse*/false, /*hostStack*/true );
 }
 
-CMemoryHandle CVulkanMathEngine::HeapAlloc( size_t size )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = memoryPool->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-
-	return result;
-}
-
-void CVulkanMathEngine::HeapFree( const CMemoryHandle& handle )
-{
-	ASSERT_EXPR( handle.GetMathEngine() == this );
-
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->Free( handle );
-}
-
-CMemoryHandle CVulkanMathEngine::StackAlloc( size_t size )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	CMemoryHandle result = deviceStackAllocator->Alloc( size );
-	if( result.IsNull() ) {
-		THROW_MEMORY_EXCEPTION;
-	}
-	return result;
-}
-
-void CVulkanMathEngine::StackFree( const CMemoryHandle& ptr )
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	deviceStackAllocator->Free( ptr );
-}
-
-size_t CVulkanMathEngine::GetFreeMemorySize() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetFreeMemorySize();
-}
-
-size_t CVulkanMathEngine::GetPeakMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetPeakMemoryUsage();
-}
-
-void CVulkanMathEngine::ResetPeakMemoryUsage()
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	memoryPool->ResetPeakMemoryUsage();
-}
-
-size_t CVulkanMathEngine::GetCurrentMemoryUsage() const
-{
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetCurrentMemoryUsage();
-}
-
-size_t CVulkanMathEngine::GetMemoryInPools() const
+CVulkanMathEngine::~CVulkanMathEngine()
 {
-	std::lock_guard<std::mutex> lock( mutex );
-	return memoryPool->GetMemoryInPools();
+	CleanUp();
 }
 
-void CVulkanMathEngine::CleanUp()
+void CVulkanMathEngine::CleanUpSpecial()
 {
-	std::lock_guard<std::mutex> lock( mutex );
-	deviceStackAllocator->CleanUp();
-	hostStackAllocator->CleanUp();
 	commandQueue->CleanUp();
 	for( auto& cur : tmpImages ) {
 		delete cur;
 		cur = 0;
 	}
-	memoryPool->CleanUp();
-}
-
-void* CVulkanMathEngine::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange )
-{
-	ASSERT_EXPR(handle.GetMathEngine() == this);
-
-	size_t realSize = size + 16;
-	char* result = reinterpret_cast<char*>( hostStackAllocator->Alloc( realSize ) );
-	size_t* posPtr = reinterpret_cast<size_t*>( result );
-	*posPtr = pos;
-	size_t* sizePtr = reinterpret_cast<size_t*>( result ) + 1;
-	*sizePtr = size;
-	if( exchange ) {
-		DataExchangeRaw( result + 16, handle, size );
-	}
-	return result + 16;
-}
-
-void CVulkanMathEngine::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange )
-{
-	ASSERT_EXPR(handle.GetMathEngine() == this);
-
-	if( exchange ) {
-		size_t* posPtr = reinterpret_cast<size_t*>( reinterpret_cast<char*>( ptr ) - 16 );
-		size_t pos = *posPtr;
-		size_t* sizePtr = posPtr + 1;
-		size_t size = *sizePtr;
-
-		DataExchangeRaw( CTypedMemoryHandle<char>( handle ) + pos, ptr, size );
-	}
-
-	hostStackAllocator->Free( reinterpret_cast<char*>( ptr ) - 16 );
 }
 
 void CVulkanMathEngine::DataExchangeRaw( const CMemoryHandle& to, const void* from, size_t size )
@@ -232,7 +104,7 @@ void CVulkanMathEngine::DataExchangeRaw( const CMemoryHandle& to, const void* fr
 	CTypedMemoryHandle<char> toPtr( to );
 	const char* fromPtr = reinterpret_cast<const char*>( from );
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	while( size != 0 ) {
 		CVulkanMemory* vulkanMemory = GetRawAllocation( toPtr );
 		ptrdiff_t vulkanOffset = GetRawOffset( toPtr );
@@ -284,7 +156,7 @@ void CVulkanMathEngine::DataExchangeRaw( void* to, const CMemoryHandle& from, si
 	CTypedMemoryHandle<char> fromPtr( from );
 	char* toPtr = reinterpret_cast<char*>( to );
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	while( size != 0 ) {
 		CVulkanMemory* vulkanMemory = GetRawAllocation( fromPtr );
 		ptrdiff_t vulkanOffset = GetRawOffset( fromPtr );
@@ -323,20 +195,6 @@ void CVulkanMathEngine::DataExchangeRaw( void* to, const CMemoryHandle& from, si
 	}
 }
 
-CMemoryHandle CVulkanMathEngine::CopyFrom( const CMemoryHandle& handle, size_t size )
-{
-	CMemoryHandle result = HeapAlloc( size );
-
-	IMathEngine* otherMathEngine = handle.GetMathEngine();
-	void* ptr = otherMathEngine->GetBuffer( handle, 0, size, true );
-
-	DataExchangeRaw( result, ptr, size );
-
-	otherMathEngine->ReleaseBuffer( handle, ptr, false );
-
-	return result;
-}
-
 CMemoryHandle CVulkanMathEngine::Alloc( size_t size )
 {
 	VkBufferUsageFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | 
@@ -425,7 +283,7 @@ void CVulkanMathEngine::runShader( const CVulkanShaderData& shader, const void*
 	const CMemoryHandle* dataBuffers, const size_t* dataSizes, int dataBufferCount,
 	int countX, int countY, int countZ )
 {
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunComputeShader( shader, Ceil(countX, shader.GroupSizeX), Ceil(countY, shader.GroupSizeY), Ceil(countZ, shader.GroupSizeZ),
 		param, paramSize, images, imageCount, samplers, samplerCount,
 		dataBuffers, dataSizes, dataBufferCount );
@@ -441,7 +299,7 @@ void CVulkanMathEngine::runVectorShader( const CVulkanShaderData& shader, const
 
 	ASSERT_EXPR(shader.GroupSizeY == 1 && shader.GroupSizeZ == 1);
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunComputeShader( shader, groupCountX, groupCountY, 1,  param, paramSize, images, imageCount, samplers, samplerCount,
 		dataBuffers, dataSizes, dataBufferCount );
 }
diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h
index 530ca9e63..acca19dab 100644
--- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h
+++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngine.h
@@ -20,13 +20,12 @@ limitations under the License.
 #ifdef NEOML_USE_VULKAN
 
 #include <vector>
-#include <mutex>
 #include <memory>
 #include <vulkan/vulkan.h>
 #include <NeoMathEngine/NeoMathEngine.h>
 #include <MathEngineAllocator.h>
 #include <VulkanImage.h>
-#include <RawMemoryManager.h>
+#include <MemoryEngineMixin.h>
 #include <PerformanceCountersDefault.h>
 #include <DllLoader.h>
 
@@ -39,10 +38,7 @@ struct CVulkanDevice;
 struct CVulkanRleConvolutionDesc;
 class CVulkanCommandQueue;
 class CVulkanShaderLoader;
-class CDeviceStackAllocator;
-class CHostStackAllocator;
 class CVulkanImage;
-class CMemoryPool;
 
 // Adds the information about available vulkan devices into the result array
 // Returns true if at least one device has been added
@@ -51,34 +47,17 @@ bool LoadVulkanEngineInfo( const CVulkanDll& dll, std::vector< CMathEngineInfo,
 //------------------------------------------------------------------------------------------------------------
 
 // The math engine on vulkan
-class CVulkanMathEngine : public IMathEngine, public IRawMemoryManager {
+class CVulkanMathEngine : public CMemoryEngineMixin, public IRawMemoryManager {
 public:
 	CVulkanMathEngine( std::unique_ptr<const CVulkanDevice>& device, size_t memoryLimit );
 	~CVulkanMathEngine() override;
 
 	// IMathEngine interface methods
 	TMathEngineType GetType() const override { return MET_Vulkan; }
-	void SetReuseMemoryMode( bool enable ) override;
-	bool GetReuseMemoryMode() const override;
-	void SetThreadBufferMemoryThreshold( size_t threshold ) override;
-	size_t GetThreadBufferMemoryThreshold() const override;
-	CMemoryHandle HeapAlloc( size_t count ) override;
-	void HeapFree( const CMemoryHandle& handle ) override;
-	void TransferHandleToThisThread( const CMemoryHandle& /*handle*/, size_t /*size*/ ) override { ASSERT_EXPR( false ); }
-	CMemoryHandle StackAlloc( size_t count ) override;
-	void StackFree( const CMemoryHandle& handle ) override;
-	size_t GetFreeMemorySize() const override;
-	size_t GetPeakMemoryUsage() const override;
-	void ResetPeakMemoryUsage() override;
-	size_t GetCurrentMemoryUsage() const override;
-	size_t GetMemoryInPools() const override;
-	void CleanUp() override;
-	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
-	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
+	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
+
 	void DataExchangeRaw( const CMemoryHandle& handle, const void* data, size_t size ) override;
 	void DataExchangeRaw( void* data, const CMemoryHandle& handle, size_t size ) override;
-	CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
-	void GetMathEngineInfo( CMathEngineInfo& info ) const override;
 
 	// IVectorMathematicsEngine interface methods
 	void VectorFill( const CFloatHandle& result, float value, int vectorSize ) override;
@@ -654,15 +633,13 @@ class CVulkanMathEngine : public IMathEngine, public IRawMemoryManager {
 	CMemoryHandle Alloc( size_t size ) override;
 	void Free( const CMemoryHandle& handle ) override;
 
+	void CleanUpSpecial() override;
+
 private:
 	CDllLoader dllLoader; // vulkan dll wrapper
-	mutable std::mutex mutex; // protecting the data below from non-thread-safe use
 	std::unique_ptr<const CVulkanDevice> device; // device descriptor
 	std::unique_ptr<CVulkanShaderLoader> shaderLoader; // shader loader
 	std::unique_ptr<CVulkanCommandQueue> commandQueue; // shader execution queue
-	std::unique_ptr<CMemoryPool> memoryPool; // memory manager
-	std::unique_ptr<CDeviceStackAllocator> deviceStackAllocator; // stack allocator for GPU memory
-	std::unique_ptr<CHostStackAllocator> hostStackAllocator; // stack allocator for host memory
 	std::vector< CVulkanImage*, CrtAllocator<CVulkanImage*> > tmpImages; // temporary images
 
 	IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }
diff --git a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp
index 811defae7..a1560d69e 100644
--- a/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp
+++ b/NeoMathEngine/src/GPU/Vulkan/VulkanMathEngineVectorMath.cpp
@@ -108,7 +108,7 @@ void CVulkanMathEngine::VectorFill( const CFloatHandle& result, float value, int
 
 	CVulkanMemory* vulkanMemory = GetRawAllocation( result );
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunFillBuffer( vulkanMemory->Buffer(), GetRawOffset( result ), size, data );
 }
 
@@ -119,7 +119,7 @@ void CVulkanMathEngine::VectorFill( const CIntHandle& result, int value, int vec
 
 	CVulkanMemory* vulkanMemory = GetRawAllocation( result );
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunFillBuffer( vulkanMemory->Buffer(), GetRawOffset(result), size, data );
 }
 
@@ -186,7 +186,7 @@ void CVulkanMathEngine::VectorCopy( const CFloatHandle& to, const CConstFloatHan
 	CVulkanMemory* vulkanMemoryFrom = GetRawAllocation(from);
 	CVulkanMemory* vulkanMemoryTo = GetRawAllocation(to);
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunCopyBuffer( vulkanMemoryFrom->Buffer(), vulkanMemoryTo->Buffer(), region );
 }
 
@@ -200,7 +200,7 @@ void CVulkanMathEngine::VectorCopy(const CIntHandle& to, const CConstIntHandle&
 	CVulkanMemory* vulkanMemoryFrom = GetRawAllocation(from);
 	CVulkanMemory* vulkanMemoryTo = GetRawAllocation(to);
 
-	std::lock_guard<std::mutex> lock( mutex );
+	std::lock_guard<std::mutex> lock( Mutex );
 	commandQueue->RunCopyBuffer( vulkanMemoryFrom->Buffer(), vulkanMemoryTo->Buffer(), region );
 }
 
diff --git a/NeoMathEngine/src/MathEngine.cpp b/NeoMathEngine/src/MathEngine.cpp
index 2e3f9ed55..9ce300f22 100644
--- a/NeoMathEngine/src/MathEngine.cpp
+++ b/NeoMathEngine/src/MathEngine.cpp
@@ -141,32 +141,38 @@ IMathEngine* CGpuMathEngineManager::CreateMathEngine( int index, size_t memoryLi
 	if( size == 0 || index >= size ) {
 		return nullptr;
 	}
+	IMathEngine* mathEngine = nullptr;
 	switch(info[index >= 0 ? index : 0].Type) {
 #ifdef NEOML_USE_CUDA
 	case MET_Cuda:
 	{
 		std::unique_ptr<CCudaDevice> device( CaptureCudaDevice( index >= 0 ? info[index].Id : -1, memoryLimit ) );
-		if( device == nullptr ) {
-			return nullptr;
+		if( device != nullptr ) {
+			mathEngine = new CCudaMathEngine( CDllLoader::cusparseDll->GetFunctions(), CDllLoader::cublasDll->GetFunctions(), device, flags );
+			break;
 		}
-		return new CCudaMathEngine( CDllLoader::cusparseDll->GetFunctions(), CDllLoader::cublasDll->GetFunctions(), device, flags );
+		return nullptr;
 	}
-#endif
+#endif //NEOML_USE_CUDA
 #ifdef NEOML_USE_VULKAN
 	case MET_Vulkan:
 	{
 		const auto& deviceInfo = loader.vulkanDll->GetDevices()[index >= 0 ? info[index].Id : 0];
 		std::unique_ptr<const CVulkanDevice> device (loader.vulkanDll->CreateDevice( deviceInfo ) );
-		if( !device ) {
-			return nullptr;
+		if( device != nullptr ) {
+			mathEngine = new CVulkanMathEngine( device, memoryLimit );
+			break;
 		}
-		return new CVulkanMathEngine( device, memoryLimit );
+		return nullptr;
 	}
-#endif
+#endif //NEOML_USE_VULKAN
 #ifdef NEOML_USE_METAL
 	case MET_Metal:
-		return new CMetalMathEngine( memoryLimit );
-#endif
+	{
+		mathEngine = new CMetalMathEngine( memoryLimit );
+		break;
+	}
+#endif //NEOML_USE_METAL
 	case MET_Undefined:
 	default:
 	{
@@ -174,6 +180,8 @@ IMathEngine* CGpuMathEngineManager::CreateMathEngine( int index, size_t memoryLi
 		return nullptr;
 	}
 	}
+	ASSERT_EXPR( mathEngine && mathEngine->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor
+	return mathEngine;
 }
 
 //------------------------------------------------------------------------------------------------------------
@@ -218,7 +226,9 @@ IMathEngineExceptionHandler* GetMathEngineExceptionHandler()
 
 IMathEngine* CreateCpuMathEngine( size_t memoryLimit )
 {
-	return new CCpuMathEngine( memoryLimit );
+	IMathEngine *mathEngine = new CCpuMathEngine( memoryLimit );
+	ASSERT_EXPR( mathEngine && mathEngine->IsInitialized() ); // Fails, if no call CMemoryEngineMixin::InitializeMemory in some child ctor
+	return mathEngine;
 }
 
 // deprecated
diff --git a/NeoMathEngine/src/MemoryEngineMixin.cpp b/NeoMathEngine/src/MemoryEngineMixin.cpp
new file mode 100644
index 000000000..9b967980b
--- /dev/null
+++ b/NeoMathEngine/src/MemoryEngineMixin.cpp
@@ -0,0 +1,231 @@
+/* Copyright © 2024 ABBYY
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+--------------------------------------------------------------------------------------------------------------*/
+
+#include <common.h>
+#pragma hdrstop
+
+#include <NeoMathEngine/NeoMathEngine.h>
+#include <MathEngineAllocator.h>
+#include <MathEngineCommon.h>
+#include <MathEngineDeviceStackAllocator.h>
+#include <MathEngineHostStackAllocator.h>
+#include <MemoryEngineMixin.h>
+#include <MemoryHandleInternal.h>
+#include <MemoryPool.h>
+
+namespace NeoML {
+
+void CMemoryEngineMixin::InitializeMemory( IRawMemoryManager* _rawManager, size_t _memoryLimit, int _memoryAlignment,
+		bool _reuse, bool _hostStack )
+{
+	MemoryAlignment = _memoryAlignment;
+	MemoryPool.reset( new CMemoryPool( _memoryLimit == 0 ? SIZE_MAX : _memoryLimit, _rawManager, _reuse ) );
+	DeviceStackAllocator.reset( new CDeviceStackAllocator( *MemoryPool, MemoryAlignment ) );
+	if( _hostStack == true ) {
+		HostStackAllocator.reset( new CHostStackAllocator( MemoryAlignment ) );
+	}
+}
+
+void CMemoryEngineMixin::SetReuseMemoryMode( bool enable )
+{
+	switch( GetType() ) {
+	case MET_Cuda:
+		// Always true, because allocation is sync
+		break;
+	case MET_Cpu:
+		// Distributed CPU math engine always uses memory pools
+		// because big simultaneous allocations on multiple (20+) threads are extremely slow
+		if( IsDistributed() ) {
+			break;
+		}
+		// fallthrough
+	case MET_Metal:
+	case MET_Vulkan:
+	{
+		std::lock_guard<std::mutex> lock( Mutex );
+		MemoryPool->SetReuseMemoryMode( enable );
+		break;
+	}
+	default:
+		ASSERT_EXPR( false );
+	}
+}
+
+bool CMemoryEngineMixin::GetReuseMemoryMode() const
+{
+	switch( GetType() ) {
+	case MET_Cuda:
+		// Always true, because allocation is sync
+		return true;
+	case MET_Cpu:
+		// Distributed CPU math engine always uses memory pools
+		if( IsDistributed() ) {
+			return true;
+		}
+		// fallthrough
+	case MET_Metal:
+	case MET_Vulkan:
+	{
+		std::lock_guard<std::mutex> lock( Mutex );
+		return MemoryPool->GetReuseMemoryMode();
+	}
+	default:
+		ASSERT_EXPR( false );
+	}
+	return false;
+}
+
+void CMemoryEngineMixin::SetThreadBufferMemoryThreshold( size_t threshold )
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	MemoryPool->SetThreadBufferMemoryThreshold( threshold );
+}
+
+size_t CMemoryEngineMixin::GetThreadBufferMemoryThreshold() const
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	return MemoryPool->GetThreadBufferMemoryThreshold();
+}
+
+CMemoryHandle CMemoryEngineMixin::HeapAlloc( size_t size )
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	CMemoryHandle result = MemoryPool->Alloc( size );
+	if( result.IsNull() ) {
+		THROW_MEMORY_EXCEPTION;
+	}
+	return result;
+}
+
+void CMemoryEngineMixin::HeapFree( const CMemoryHandle& handle )
+{
+	ASSERT_EXPR( handle.GetMathEngine() == this );
+
+	std::lock_guard<std::mutex> lock( Mutex );
+	MemoryPool->Free( handle );
+}
+
+void CMemoryEngineMixin::TransferHandleToThisThread( const CMemoryHandle& handle, size_t size )
+{
+	ASSERT_EXPR( GetType() == MET_Cpu || GetType() == MET_Cuda );
+	ASSERT_EXPR( handle.GetMathEngine() == this );
+
+	std::lock_guard<std::mutex> lock( Mutex );
+	MemoryPool->TransferHandleToThisThread( handle, size );
+}
+
+CMemoryHandle CMemoryEngineMixin::StackAlloc( size_t size )
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	CMemoryHandle result = DeviceStackAllocator->Alloc( size );
+	if( result.IsNull() ) {
+		THROW_MEMORY_EXCEPTION;
+	}
+	return result;
+}
+
+void CMemoryEngineMixin::StackFree( const CMemoryHandle& ptr )
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	DeviceStackAllocator->Free( ptr );
+}
+
+size_t CMemoryEngineMixin::GetFreeMemorySize() const
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	return MemoryPool->GetFreeMemorySize();
+}
+
+size_t CMemoryEngineMixin::GetPeakMemoryUsage() const
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	return MemoryPool->GetPeakMemoryUsage();
+}
+
+void CMemoryEngineMixin::ResetPeakMemoryUsage()
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	MemoryPool->ResetPeakMemoryUsage();
+}
+
+size_t CMemoryEngineMixin::GetCurrentMemoryUsage() const
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	return MemoryPool->GetCurrentMemoryUsage();
+}
+
+size_t CMemoryEngineMixin::GetMemoryInPools() const
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	return MemoryPool->GetMemoryInPools();
+}
+
+void CMemoryEngineMixin::CleanUp()
+{
+	std::lock_guard<std::mutex> lock( Mutex );
+	DeviceStackAllocator->CleanUp();
+	if( HostStackAllocator != nullptr ) {
+		HostStackAllocator->CleanUp();
+	}
+	CleanUpSpecial();
+	MemoryPool->CleanUp();
+}
+
+constexpr size_t bufferHeaderSize = 2 * sizeof( size_t );
+
+void* CMemoryEngineMixin::GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange )
+{
+	ASSERT_EXPR( HostStackAllocator != nullptr );
+	ASSERT_EXPR( handle.GetMathEngine() == this );
+
+	const size_t realSize = size + bufferHeaderSize;
+	char* result = static_cast<char*>( HostStackAllocator->Alloc( realSize ) );
+
+	size_t* header = reinterpret_cast<size_t*>( result );
+	header[0] = pos;
+	header[1] = size;
+
+	if( exchange ) {
+		DataExchangeRaw( result + bufferHeaderSize, handle, size );
+	}
+	return result + bufferHeaderSize;
+}
+
+void CMemoryEngineMixin::ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange )
+{
+	ASSERT_EXPR( HostStackAllocator != nullptr );
+	ASSERT_EXPR( handle.GetMathEngine() == this );
+
+	if( exchange ) {
+		size_t* header = reinterpret_cast<size_t*>( static_cast<char*>( ptr ) - bufferHeaderSize );
+		size_t pos = header[0];
+		size_t size = header[1];
+
+		DataExchangeRaw( CTypedMemoryHandle<char>( handle ) + pos, ptr, size );
+	}
+	HostStackAllocator->Free( static_cast<char*>( ptr ) - bufferHeaderSize );
+}
+
+CMemoryHandle CMemoryEngineMixin::CopyFrom( const CMemoryHandle& handle, size_t size )
+{
+	CMemoryHandle result = HeapAlloc( size );
+
+	IMathEngine* otherMathEngine = handle.GetMathEngine();
+	otherMathEngine->DataExchangeRaw( GetRaw( result ), handle, size );
+
+	return result;
+}
+
+} // namespace NeoML
diff --git a/NeoMathEngine/src/MemoryEngineMixin.h b/NeoMathEngine/src/MemoryEngineMixin.h
new file mode 100644
index 000000000..bc4091363
--- /dev/null
+++ b/NeoMathEngine/src/MemoryEngineMixin.h
@@ -0,0 +1,66 @@
+/* Copyright © 2024 ABBYY
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+--------------------------------------------------------------------------------------------------------------*/
+
+#pragma once
+
+#include <NeoMathEngine/NeoMathEngineDefs.h>
+#include <RawMemoryManager.h>
+#include <memory>
+#include <mutex>
+
+namespace NeoML {
+
+class CDeviceStackAllocator;
+class CHostStackAllocator;
+class CMemoryPool;
+
+// Memory management engine base class
+class CMemoryEngineMixin : public IMathEngine {
+public:
+	void InitializeMemory( IRawMemoryManager* _rawManager, size_t _memoryLimit, int _memoryAlignment,
+		bool _reuse, bool _hostStack );
+	bool IsInitialized() const override { return MemoryAlignment > 0 && MemoryPool != 0 && DeviceStackAllocator != 0; }
+
+	void SetReuseMemoryMode( bool enable ) override;
+	bool GetReuseMemoryMode() const override;
+	void SetThreadBufferMemoryThreshold( size_t threshold ) override;
+	size_t GetThreadBufferMemoryThreshold() const override;
+	CMemoryHandle HeapAlloc( size_t count ) override;
+	void HeapFree( const CMemoryHandle& handle ) override;
+	void TransferHandleToThisThread( const CMemoryHandle& handle, size_t size ) override;
+	CMemoryHandle StackAlloc( size_t count ) override;
+	void StackFree( const CMemoryHandle& handle ) override;
+	size_t GetFreeMemorySize() const override;
+	size_t GetPeakMemoryUsage() const override;
+	void ResetPeakMemoryUsage() override;
+	size_t GetCurrentMemoryUsage() const override;
+	size_t GetMemoryInPools() const override;
+
+	void CleanUp() override;
+	void* GetBuffer( const CMemoryHandle& handle, size_t pos, size_t size, bool exchange ) override;
+	void ReleaseBuffer( const CMemoryHandle& handle, void* ptr, bool exchange ) override;
+	CMemoryHandle CopyFrom( const CMemoryHandle& handle, size_t size ) override;
+
+protected:
+	int MemoryAlignment = 0; // allocation alignment
+	mutable std::mutex Mutex; // protecting the data below from non-thread-safe use
+	std::unique_ptr<CMemoryPool> MemoryPool; // memory manager
+	std::unique_ptr<CDeviceStackAllocator> DeviceStackAllocator; // stack allocator for GPU memory
+	std::unique_ptr<CHostStackAllocator> HostStackAllocator; // stack allocator for regular memory
+
+	void CleanUpSpecial() override {}
+};
+
+} // namespace NeoML