diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 4b7f5c809..06e700ee1 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -95,7 +95,7 @@ hip::Stream* getStream(hipStream_t stream, bool wait) { hip::Stream* hip_stream = reinterpret_cast(stream); if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; - iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); + hip_stream->GetDevice()->WaitActiveStreams(hip_stream, WaitNullStreamOnly); } return hip_stream; } diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 019ee223e..5ea357f2f 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -43,7 +43,7 @@ hip::Stream* Device::NullStream(bool wait) { } if (wait == true) { // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + WaitActiveStreams(null_stream_); } return null_stream_; } @@ -149,11 +149,150 @@ void Device::Reset() { mem_pools_.clear(); } flags_ = hipDeviceScheduleSpin; - hip::Stream::destroyAllStreams(deviceId_); + destroyAllStreams(); amd::MemObjMap::Purge(devices()[0]); Create(); } +// ================================================================================================ +void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { + amd::Command::EventWaitList eventWaitList(0); + bool submitMarker = 0; + + auto waitForStream = [&submitMarker, + &eventWaitList](hip::Stream* stream) { + if (amd::Command *command = stream->getLastQueuedCommand(true)) { + amd::Event &event = command->event(); + // Check HW status of the ROCcrl event. + // Note: not all ROCclr modes support HW status + bool ready = stream->device().IsHwEventReady(event); + if (!ready) { + ready = (command->status() == CL_COMPLETE); + } + submitMarker |= stream->vdev()->isFenceDirty(); + // Check the current active status + if (!ready) { + command->notifyCmdQueue(); + eventWaitList.push_back(command); + } else { + command->release(); + } + } + }; + + if (wait_null_stream) { + if (null_stream_) { + waitForStream(null_stream_); + } + } else { + amd::ScopedLock lock(streamSetLock); + + for (const auto& active_stream : streamSet) { + // If it's the current device + if (// Make sure it's a default stream + ((active_stream->Flags() & hipStreamNonBlocking) == 0) && + // and it's not the current stream + (active_stream != blocking_stream)) { + // Get the last valid command + waitForStream(active_stream); + } + } + } + + // Check if we have to wait anything + if (eventWaitList.size() > 0 || submitMarker) { + amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); + if (command != nullptr) { + command->enqueue(); + command->release(); + } + } + + // Release all active commands. It's safe after the marker was enqueued + for (const auto& it : eventWaitList) { + it->release(); + } +} + +// ================================================================================================ +void Device::AddStream(Stream* stream) { + amd::ScopedLock lock(streamSetLock); + streamSet.insert(stream); +} + +// ================================================================================================ +void Device::RemoveStream(Stream* stream){ + amd::ScopedLock lock(streamSetLock); + streamSet.erase(stream); +} + +// ================================================================================================ +bool Device::StreamExists(Stream* stream){ + amd::ScopedLock lock(streamSetLock); + if (streamSet.find(stream) != streamSet.end()) { + return true; + } + return false; +} + +// ================================================================================================ +void Device::destroyAllStreams() { + std::vector toBeDeleted; + { + amd::ScopedLock lock(streamSetLock); + for (auto& it : streamSet) { + if (it->Null() == false ) { + toBeDeleted.push_back(it); + } + } + } + for (auto& it : toBeDeleted) { + hip::Stream::Destroy(it); + } +} + +// ================================================================================================ +void Device::SyncAllStreams( bool cpu_wait) { + // Make a local copy to avoid stalls for GPU finish with multiple threads + std::vector streams; + streams.reserve(streamSet.size()); + { + amd::ScopedLock lock(streamSetLock); + for (auto it : streamSet) { + streams.push_back(it); + it->retain(); + } + } + for (auto it : streams) { + it->finish(cpu_wait); + it->release(); + } + // Release freed memory for all memory pools on the device + ReleaseFreedMemory(); +} + +// ================================================================================================ +bool Device::StreamCaptureBlocking() { + amd::ScopedLock lock(streamSetLock); + for (auto& it : streamSet) { + if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) { + return true; + } + } + return false; +} + +// ================================================================================================ +bool Device::existsActiveStreamForDevice() { + amd::ScopedLock lock(streamSetLock); + for (const auto& active_stream : streamSet) { + if (active_stream->GetQueueStatus()) { + return true; + } + } + return false; +} + // ================================================================================================ Device::~Device() { if (default_mem_pool_ != nullptr) { diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp index 19a045dba..b199bcdde 100644 --- a/hipamd/src/hip_device_runtime.cpp +++ b/hipamd/src/hip_device_runtime.cpp @@ -610,7 +610,7 @@ hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) { hipError_t hipDeviceSynchronize() { HIP_INIT_API(hipDeviceSynchronize); constexpr bool kDoWaitForCpu = true; - hip::Stream::SyncAllStreams(hip::getCurrentDevice()->deviceId(), kDoWaitForCpu); + hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu); HIP_RETURN(hipSuccess); } diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp index 0fff3323c..30ac1fc33 100644 --- a/hipamd/src/hip_graph.cpp +++ b/hipamd/src/hip_graph.cpp @@ -484,7 +484,7 @@ hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D* } p.dstArray = pCopy->dstArray; p.dstPos = {pCopy->dstXInBytes, pCopy->dstY, 0}; - p.dstPtr.pitch = pCopy->srcPitch; + p.dstPtr.pitch = pCopy->dstPitch; if (pCopy->dstDevice != nullptr) { p.dstPtr.ptr = pCopy->dstDevice; } @@ -719,6 +719,7 @@ hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitc memsetParams.width = width; memsetParams.height = height; memsetParams.pitch = pitch; + memsetParams.elementSize = 1; hip::Stream* s = reinterpret_cast(stream); hip::GraphNode* pGraphNode; hipError_t status = diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp index 46a03e0b4..643d2c234 100644 --- a/hipamd/src/hip_internal.hpp +++ b/hipamd/src/hip_internal.hpp @@ -306,15 +306,9 @@ class stream_per_thread { /// Returns the CU mask for the current stream const std::vector GetCUMask() const { return cuMask_; } - /// Sync all streams - static void SyncAllStreams(int deviceId, bool cpu_wait = true); - /// Check whether any blocking stream running static bool StreamCaptureBlocking(); - /// Destroy all streams on a given device - static void destroyAllStreams(int deviceId); - static void Destroy(hip::Stream* stream); /// Check Stream Capture status to make sure it is done @@ -416,7 +410,6 @@ class stream_per_thread { parallelCaptureStreams_.erase(it); } } - static bool existsActiveStreamForDevice(hip::Device* device); /// The stream should be destroyed via release() rather than delete private: @@ -426,6 +419,8 @@ class stream_per_thread { /// HIP Device class class Device { amd::Monitor lock_{"Device lock", true}; + amd::Monitor streamSetLock{"Guards device stream set"}; + std::unordered_set streamSet; /// ROCclr context amd::Context* context_; /// Device's ID @@ -499,7 +494,7 @@ class stream_per_thread { amd::ScopedLock lock(lock_); /// Either stream is active or device is active if (isActive_) return true; - if (Stream::existsActiveStreamForDevice(this)) { + if (existsActiveStreamForDevice()) { isActive_ = true; return true; } @@ -540,6 +535,22 @@ class stream_per_thread { /// Returns true if memory pool is valid on this device bool IsMemoryPoolValid(MemoryPool* pool); + void AddStream(Stream* stream); + + void RemoveStream(Stream* stream); + + bool StreamExists(Stream* stream); + + void destroyAllStreams(); + + void SyncAllStreams( bool cpu_wait = true); + + bool StreamCaptureBlocking(); + + bool existsActiveStreamForDevice(); + /// Wait all active streams on the blocking queue. The method enqueues a wait command and + /// doesn't stall the current thread + void WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); }; /// Thread Local Storage Variables Aggregator Class @@ -589,10 +600,6 @@ class stream_per_thread { extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data); - /// Wait all active streams on the blocking queue. The method enqueues a wait command and - /// doesn't stall the current thread - extern void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); - extern std::vector g_devices; extern hipError_t ihipDeviceGetCount(int* count); extern int ihipGetDevice(); diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 613908501..c95cdefe9 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -76,7 +76,7 @@ hipError_t ihipFree(void *ptr) { if (memory_object != nullptr) { // Wait on the device, associated with the current memory object during allocation auto device_id = memory_object->getUserData().deviceId; - hip::Stream::SyncAllStreams(device_id); + g_devices[device_id]->SyncAllStreams(); // Find out if memory belongs to any memory pool if (!g_devices[device_id]->FreeMemory(memory_object, nullptr)) { @@ -743,7 +743,7 @@ hipError_t ihipArrayDestroy(hipArray_t array) { auto image = as_amd(memObj); // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(image->getUserData().deviceId); + g_devices[image->getUserData().deviceId]->SyncAllStreams(); image->release(); delete array; @@ -1252,7 +1252,7 @@ hipError_t ihipHostUnregister(void* hostPtr) { if (mem != nullptr) { // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(mem->getUserData().deviceId); + g_devices[mem->getUserData().deviceId]->SyncAllStreams(); amd::MemObjMap::RemoveMemObj(hostPtr); for (const auto& device: g_devices) { @@ -4304,7 +4304,7 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { auto image = as_amd(mem_obj); // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(image->getUserData().deviceId); + g_devices[image->getUserData().deviceId]->SyncAllStreams(); image->release(); delete mipmapped_array_ptr; diff --git a/hipamd/src/hip_platform.cpp b/hipamd/src/hip_platform.cpp index e88458dc4..1d5e61dba 100644 --- a/hipamd/src/hip_platform.cpp +++ b/hipamd/src/hip_platform.cpp @@ -175,7 +175,13 @@ void __hipRegisterTexture( } void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) { - hipError_t err = PlatformState::instance().removeFatBinary(modules); + // By calling hipDeviceSynchronize ensure that all HSA signal handlers + // complete before removeFatBinary + hipError_t err = hipDeviceSynchronize(); + if (err != hipSuccess) { + LogPrintfError("Error during hipDeviceSynchronize, error: %d", err); + } + err = PlatformState::instance().removeFatBinary(modules); guarantee((err == hipSuccess), "Cannot Unregister Fat Binary, error:%d", err); } diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 9d0475bc9..a78fc093d 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -25,8 +25,6 @@ #include "hip_prof_api.h" namespace hip { -static amd::Monitor streamSetLock{"Guards global stream set"}; -static std::unordered_set streamSet; // ================================================================================================ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, @@ -43,8 +41,7 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, originStream_(false), captureID_(0) { - amd::ScopedLock lock(streamSetLock); - streamSet.insert(this); + device_->AddStream(this); } // ================================================================================================ @@ -76,10 +73,7 @@ bool Stream::Create() { // ================================================================================================ void Stream::Destroy(hip::Stream* stream) { - { - amd::ScopedLock lock(streamSetLock); - streamSet.erase(stream); - } + stream->device_->RemoveStream(stream); stream->release(); } @@ -95,11 +89,12 @@ bool isValid(hipStream_t& stream) { } hip::Stream* s = reinterpret_cast(stream); - amd::ScopedLock lock(streamSetLock); - if (streamSet.find(s) == streamSet.end()) { - return false; + for (auto& device : g_devices) { + if (device->StreamExists(s)) { + return true; + } } - return true; + return false; } // ================================================================================================ @@ -122,53 +117,17 @@ int Stream::DeviceId(const hipStream_t hStream) { } // ================================================================================================ -void Stream::SyncAllStreams(int deviceId, bool cpu_wait) { - // Make a local copy to avoid stalls for GPU finish with multiple threads - std::vector streams; - streams.reserve(streamSet.size()); - { - amd::ScopedLock lock(streamSetLock); - for (auto it : streamSet) { - if (it->DeviceId() == deviceId) { - streams.push_back(it); - it->retain(); - } - } - } - for (auto it : streams) { - it->finish(cpu_wait); - it->release(); - } - // Release freed memory for all memory pools on the device - g_devices[deviceId]->ReleaseFreedMemory(); -} // ================================================================================================ bool Stream::StreamCaptureBlocking() { - amd::ScopedLock lock(streamSetLock); - for (auto& it : streamSet) { - if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) { + for (auto& device : g_devices) { + if (device->StreamCaptureBlocking()) { return true; } } return false; } -void Stream::destroyAllStreams(int deviceId) { - std::vector toBeDeleted; - { - amd::ScopedLock lock(streamSetLock); - for (auto& it : streamSet) { - if (it->Null() == false && it->DeviceId() == deviceId) { - toBeDeleted.push_back(it); - } - } - } - for (auto& it : toBeDeleted) { - hip::Stream::Destroy(it); - } -} - bool Stream::StreamCaptureOngoing(hipStream_t hStream) { hip::Stream* s = reinterpret_cast(hStream); // Allow capture to be less restrictive one one changes the stream capture interaction @@ -188,80 +147,6 @@ bool Stream::StreamCaptureOngoing(hipStream_t hStream) { } } -bool Stream::existsActiveStreamForDevice(hip::Device* device) { - - amd::ScopedLock lock(streamSetLock); - - for (const auto& active_stream : streamSet) { - if ((active_stream->GetDevice() == device) && - active_stream->GetQueueStatus()) { - return true; - } - } - return false; -} - -// ================================================================================================ -void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { - amd::Command::EventWaitList eventWaitList(0); - bool submitMarker = 0; - - auto waitForStream = [&submitMarker, - &eventWaitList](hip::Stream* stream) { - if (amd::Command *command = stream->getLastQueuedCommand(true)) { - amd::Event &event = command->event(); - // Check HW status of the ROCcrl event. - // Note: not all ROCclr modes support HW status - bool ready = stream->device().IsHwEventReady(event); - if (!ready) { - ready = (command->status() == CL_COMPLETE); - } - submitMarker |= stream->vdev()->isFenceDirty(); - // Check the current active status - if (!ready) { - command->notifyCmdQueue(); - eventWaitList.push_back(command); - } else { - command->release(); - } - } - }; - - if (wait_null_stream) { - if (hip::Stream* null_stream = blocking_stream->GetDevice()->GetNullStream()) { - waitForStream(null_stream); - } - } else { - amd::ScopedLock lock(streamSetLock); - - for (const auto& active_stream : streamSet) { - // If it's the current device - if ((&active_stream->device() == &blocking_stream->device()) && - // Make sure it's a default stream - ((active_stream->Flags() & hipStreamNonBlocking) == 0) && - // and it's not the current stream - (active_stream != blocking_stream)) { - // Get the last valid command - waitForStream(active_stream); - } - } - } - - // Check if we have to wait anything - if (eventWaitList.size() > 0 || submitMarker) { - amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); - if (command != nullptr) { - command->enqueue(); - command->release(); - } - } - - // Release all active commands. It's safe after the marker was enqueued - for (const auto& it : eventWaitList) { - it->release(); - } -} - // ================================================================================================ void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status, void* user_data) { StreamCallback* cbo = reinterpret_cast(user_data); diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt index 523eb2b5c..658547f11 100644 --- a/opencl/CMakeLists.txt +++ b/opencl/CMakeLists.txt @@ -15,7 +15,7 @@ set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory") include(GNUInstallDirs) option(BUILD_TESTS "Enable building OpenCL tests" OFF) -option(BUILD_ICD "Enable building OpenCL ICD Loader" OFF) +option(BUILD_ICD "Enable building OpenCL ICD Loader" ON) option(EMU_ENV "Enable building for emulation environment" OFF) option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorganization backward compatibility" OFF) @@ -26,10 +26,6 @@ if(MSVC) endif() set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_LIST_DIR}/khronos/headers/opencl2.2" CACHE PATH "") - -###--- Packaging ------------------------------------------------------------### - -# DEV package if(BUILD_ICD) add_subdirectory(khronos/icd) else() @@ -42,6 +38,19 @@ if(BUILD_TESTS) add_subdirectory(tests/ocltst) endif() +###--- Packaging ------------------------------------------------------------### + +# DEV package +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/khronos/headers/opencl2.2/CL" + DESTINATION include + COMPONENT DEV + PATTERN cl_d3d10.h EXCLUDE + PATTERN cl_d3d11.h EXCLUDE + PATTERN cl_dx9_media_sharing.h EXCLUDE + PATTERN cl_egl.h EXCLUDE + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + ) + ############################# # Packaging steps ############################# diff --git a/opencl/packaging/CMakeLists.txt b/opencl/packaging/CMakeLists.txt index 3643f88e7..d9329c415 100644 --- a/opencl/packaging/CMakeLists.txt +++ b/opencl/packaging/CMakeLists.txt @@ -22,6 +22,12 @@ install(TARGETS amdocl DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan) install(FILES ${opencl_SOURCE_DIR}/LICENSE.txt DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary) install(FILES ${opencl_SOURCE_DIR}/LICENSE.txt DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan) +install(DIRECTORY ${opencl_SOURCE_DIR}/khronos/headers/opencl2.2/CL + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT dev + USE_SOURCE_PERMISSIONS + PATTERN cl_d3d10.h EXCLUDE + PATTERN cl_d3d11.h EXCLUDE + PATTERN cl_dx9_media_sharing.h EXCLUDE ) if(BUILD_ICD) install(TARGETS OpenCL DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT icd ) diff --git a/opencl/tests/ocltst/env/CMakeLists.txt b/opencl/tests/ocltst/env/CMakeLists.txt index 13dec123f..308acc7d6 100644 --- a/opencl/tests/ocltst/env/CMakeLists.txt +++ b/opencl/tests/ocltst/env/CMakeLists.txt @@ -34,11 +34,10 @@ target_include_directories(ocltst PRIVATE $) -target_link_libraries(ocltst PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) - -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(ocltst + PRIVATE + OpenCL + ) set_target_properties(ocltst PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/opencl/tests/ocltst/module/gl/CMakeLists.txt b/opencl/tests/ocltst/module/gl/CMakeLists.txt index 41dffa15b..115879cf1 100644 --- a/opencl/tests/ocltst/module/gl/CMakeLists.txt +++ b/opencl/tests/ocltst/module/gl/CMakeLists.txt @@ -45,7 +45,9 @@ target_include_directories(oclgl PRIVATE $) -target_link_libraries(oclgl PRIVATE +target_link_libraries(oclgl + PRIVATE + OpenCL ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES}) diff --git a/opencl/tests/ocltst/module/perf/CMakeLists.txt b/opencl/tests/ocltst/module/perf/CMakeLists.txt index e0134c922..b9780283c 100644 --- a/opencl/tests/ocltst/module/perf/CMakeLists.txt +++ b/opencl/tests/ocltst/module/perf/CMakeLists.txt @@ -95,10 +95,9 @@ target_include_directories(oclperf PRIVATE $) -target_link_libraries(oclperf PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(oclperf + PRIVATE + OpenCL) add_custom_command( TARGET oclperf POST_BUILD diff --git a/opencl/tests/ocltst/module/runtime/CMakeLists.txt b/opencl/tests/ocltst/module/runtime/CMakeLists.txt index 1ffd58ad0..0b5de9417 100644 --- a/opencl/tests/ocltst/module/runtime/CMakeLists.txt +++ b/opencl/tests/ocltst/module/runtime/CMakeLists.txt @@ -68,10 +68,9 @@ target_include_directories(oclruntime PRIVATE $) -target_link_libraries(oclruntime PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(oclruntime + PRIVATE + OpenCL) add_custom_command( TARGET oclruntime POST_BUILD diff --git a/opencl/tools/clinfo/CMakeLists.txt b/opencl/tools/clinfo/CMakeLists.txt index da795267d..217f18225 100644 --- a/opencl/tools/clinfo/CMakeLists.txt +++ b/opencl/tools/clinfo/CMakeLists.txt @@ -4,7 +4,7 @@ target_compile_definitions(clinfo PRIVATE CL_TARGET_OPENCL_VERSION=220 HAVE_CL2_ target_include_directories(clinfo PRIVATE ${OPENCL_ICD_LOADER_HEADERS_DIR}) -target_link_libraries(clinfo OpenCL::OpenCL) +target_link_libraries(clinfo OpenCL) INSTALL(TARGETS clinfo RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/rocclr/device/devprogram.cpp b/rocclr/device/devprogram.cpp index 07d4914c2..c030e76b0 100644 --- a/rocclr/device/devprogram.cpp +++ b/rocclr/device/devprogram.cpp @@ -1172,10 +1172,6 @@ bool Program::linkImplLC(amd::option::Options* options) { if (options->oVariables->FP32RoundDivideSqrt) { linkOptions.push_back("correctly_rounded_sqrt"); } - if (options->oVariables->DenormsAreZero || AMD_GPU_FORCE_SINGLE_FP_DENORM == 0 || - (device().isa().versionMajor() < 9 && AMD_GPU_FORCE_SINGLE_FP_DENORM < 0)) { - linkOptions.push_back("daz_opt"); - } if (options->oVariables->FiniteMathOnly || options->oVariables->FastRelaxedMath) { linkOptions.push_back("finite_only"); } diff --git a/rocclr/device/pal/palresource.cpp b/rocclr/device/pal/palresource.cpp index 5f8a86375..40c512364 100644 --- a/rocclr/device/pal/palresource.cpp +++ b/rocclr/device/pal/palresource.cpp @@ -1512,21 +1512,21 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, (size[0] < dev().settings().cpDmaCopySizeMax_)); if (cp_dma) { // Make sure compute is done before CP DMA start - gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::KernelToCopy); } else { gpu.releaseGpuMemoryFence(); gpu.engineID_ = SdmaEngine; + + if (gpu.validateSdmaOverlap(*this, dstResource)) { + // Note: PAL should insert a NOP into the command buffer for synchronization + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::CopyToCopy); + } } // Wait for the resources, since runtime may use async transfers wait(gpu, waitOnBusyEngine); dstResource.wait(gpu, waitOnBusyEngine); - if (gpu.validateSdmaOverlap(*this, dstResource)) { - // Note: PAL should insert a NOP into the command buffer for synchronization - gpu.addBarrier(); - } - Pal::ImageLayout imgLayout = {}; gpu.eventBegin(gpu.engineID_); gpu.queue(gpu.engineID_).addCmdMemRef(memRef()); @@ -1626,7 +1626,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, if (cp_dma) { // Make sure CP dma is done - gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::CopyToKernel); } gpu.eventEnd(gpu.engineID_, event); diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index a19d3aa1c..da61f8231 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -2404,8 +2404,7 @@ void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& static_cast(gpuDefQueue->blitMgr()) .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); - const static bool FlushL2 = true; - gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2); + gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, BarrierType::FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler @@ -3020,8 +3019,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { engineID_ = static_cast(pGpuMemory->getGpuEvent(*this)->engineId_); // Make sure GPU finished operation and data reached memory before the marker write - static constexpr bool FlushL2 = true; - addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2); + addBarrier(RgpSqqtBarrierReason::SignalSubmit, BarrierType::FlushL2); // Workarounds: We had systems where an extra delay was necessary. { // Flush CB associated with the DGMA buffer diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index 170753426..bf46db927 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -66,6 +66,14 @@ struct AqlPacketMgmt : public amd::EmbeddedObject { std::atomic packet_index_; //!< The active packet slot index }; + enum class BarrierType : uint8_t { + KernelToKernel = 0, + KernelToCopy, + CopyToKernel, + CopyToCopy, + FlushL2 +}; + //! Virtual GPU class VirtualGPU : public device::VirtualDevice { public: @@ -478,18 +486,29 @@ class VirtualGPU : public device::VirtualDevice { //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, - bool flushL2 = false) const { + void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::MemDependency, + BarrierType type = BarrierType::KernelToKernel) const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; barrier.pPipePoints = &point; barrier.transitionCount = 1; - uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = { - cacheMask, - cacheMask, - {nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + Pal::BarrierTransition trans = {}; + trans.srcCacheMask = Pal::CoherShader; + trans.dstCacheMask = Pal::CoherShader; + trans.imageInfo.oldLayout.usages = Pal::LayoutShaderRead; + trans.imageInfo.oldLayout.engines = Pal::LayoutComputeEngine; + trans.imageInfo.newLayout.usages = Pal::LayoutShaderRead; + trans.imageInfo.newLayout.engines = Pal::LayoutComputeEngine; + if (type == BarrierType::KernelToCopy) { + trans.dstCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::CopyToKernel) { + trans.srcCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::CopyToCopy) { + trans.dstCacheMask = trans.srcCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::FlushL2) { + trans.dstCacheMask = trans.srcCacheMask = Pal::CoherCopy | Pal::CoherCpu; + } barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; barrier.reason = static_cast(reason); diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 5eee23e65..093ff435e 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -183,6 +183,7 @@ Device::Device(hsa_agent_t bkendDevice) , xferQueue_(nullptr) , xferRead_(nullptr) , xferWrite_(nullptr) + , p2p_agents_list_(nullptr) , freeMem_(0) , vgpusAccess_("Virtual GPU List Ops Lock", true) , hsa_exclusive_gpu_access_(false) @@ -298,7 +299,8 @@ Device::~Device() { context_->release(); } - delete[] p2p_agents_list_; + if (p2p_agents_list_ != nullptr) + delete[] p2p_agents_list_; if (coopHostcallBuffer_) { disableHostcalls(coopHostcallBuffer_);