diff --git a/cmake/ROCclr.cmake b/cmake/ROCclr.cmake index 4d8f6a25..c7795ba8 100644 --- a/cmake/ROCclr.cmake +++ b/cmake/ROCclr.cmake @@ -100,7 +100,6 @@ endif() target_compile_definitions(rocclr PUBLIC LITTLEENDIAN_CPU - WITH_LIQUID_FLASH=0 ${AMD_OPENCL_DEFS}) target_include_directories(rocclr PUBLIC diff --git a/device/blit.cpp b/device/blit.cpp index 3c6b5dbf..a2c25376 100644 --- a/device/blit.cpp +++ b/device/blit.cpp @@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz std::vector& packed_info) { // 1. Validate input arguments - guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size"); - guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size"); + guarantee(fill_size >= pattern_size, "Pattern Size: %zu cannot be greater than fill size: %zu\n", + pattern_size, fill_size); + guarantee(fill_size <= memory.size(), "Cannot fill: %zu more than the mem object size: %zu\n", + fill_size, memory.size()); // 2. Calculate the next closest dword aligned address for faster processing size_t dst_addr = memory.virtualAddress() + fill_origin; size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t)); - guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination" - "address"); + guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %zu cannot be greater than destination " + "address: %zu\n", aligned_dst_addr, dst_addr); // 3. If given address is not aligned calculate head and tail size. size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size); diff --git a/device/devhcprintf.cpp b/device/devhcprintf.cpp index b36f7758..a5aba4a4 100644 --- a/device/devhcprintf.cpp +++ b/device/devhcprintf.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 8226e3da..1729e5eb 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -84,10 +84,11 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* if (!messages.handlePayload(service, payload)) { ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".", service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request."); + guarantee(false, "Hostcall: invalid service request %d \n", service); } return; case SERVICE_DEVMEM: { + guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n"); if (payload[0]) { amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast(payload[0])); if (mem) { @@ -114,9 +115,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* return; } default: - ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".", - service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported."); + guarantee(false, "Hostcall: no handler found for service ID %d \n", service); return; } } diff --git a/device/device.cpp b/device/device.cpp index 2709febd..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = MemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "Memobj map does not have ptr"); - } + guarantee(rval == 1, "Memobj map does not have 
ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) { @@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveVirtualMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Virtual Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "VirtualMemobj map does not have ptr"); - } + guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { diff --git a/device/device.hpp b/device/device.hpp index 64d67465..3e65a331 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -89,7 +89,6 @@ class SvmFillMemoryCommand; class SvmMapMemoryCommand; class SvmUnmapMemoryCommand; class SvmPrefetchAsyncCommand; -class TransferBufferFileCommand; class StreamOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; @@ -154,7 +153,6 @@ enum OclExtensions { ClKhrD3d9Sharing, #endif ClKhrImage2dFromBuffer, - ClAmdSemaphore, ClAMDBusAddressableMemory, ClAMDC11Atomics, ClKhrSpir, @@ -163,8 +161,6 @@ enum OclExtensions { ClKhrDepthImages, ClKhrMipMapImage, ClKhrMipMapImageWrites, - ClKhrIlProgram, - ClAMDLiquidFlash, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -200,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_dx9_media_sharing ", #endif "cl_khr_image2d_from_buffer ", - "", "cl_amd_bus_addressable_memory ", "cl_amd_c11_atomics ", "cl_khr_spir ", @@ -209,8 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_depth_images ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", - "", - "cl_amd_liquid_flash ", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) @@ -1241,9 +1234,6 @@ class VirtualDevice : public amd::HeapObject { /// Optional extensions virtual void submitSignal(amd::SignalCommand& cmd) = 0; virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0; - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - ShouldNotReachHere(); - } virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { ShouldNotReachHere(); } @@ -1270,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; + //! Resets fence state of the VirtualGPU + virtual void resetFenceDirty() = 0; + private: //! 
Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); @@ -1796,9 +1789,14 @@ class Device : public RuntimeObject { // Returns the status of HW event, associated with amd::Event virtual bool IsHwEventReady( - const amd::Event& event, //!< AMD event for HW status validation - bool wait = false //!< If true then forces the event completion - ) const { + const amd::Event& event, //!< AMD event for HW status validation + bool wait = false) const { //!< If true then forces the event completion + return false; + }; + + // Returns the status of HW event, associated with amd::Event + virtual bool IsHwEventReadyForcedWait( + const amd::Event& event) const { //!< AMD event for HW status validation return false; }; diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 5455d80c..f1a2cb2e 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -2977,7 +2977,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const { amd::HostQueue* queue = nullptr; for (const auto& i : kernels_) { - LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str()); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str()); const auto &kernel = i.second; if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) || (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) { diff --git a/device/devwavelimiter.cpp b/device/devwavelimiter.cpp index beec1bce..deaadd98 100644 --- a/device/devwavelimiter.cpp +++ b/device/devwavelimiter.cpp @@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( // ================================================================================================ void WaveLimiterManager::enable(bool isSupported) { if (fixed_ > 0) { + enable_ = GPU_WAVE_LIMIT_ENABLE; return; } diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 75cd3c3e..9adfabff 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32}, {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1}, + {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, }; static std::tuple findIsa(Pal::AsicRevision asicRevision, @@ -632,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; - if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { + if (amd::IS_HIP) { + info_.largeBar_ = false; + } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { info_.largeBar_ = true; ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled"); } @@ -2356,8 +2359,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAlloc = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAlloc = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -2369,7 +2372,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result 
= static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/pal/paldevice.hpp b/device/pal/paldevice.hpp index 5b3b4af5..0226ea40 100644 --- a/device/pal/paldevice.hpp +++ b/device/pal/paldevice.hpp @@ -629,7 +629,7 @@ class Device : public NullDevice { #endif #endif //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index 7c6357e9..e5f7dd2e 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) se_mask_(0), perf_counter_mem_limit_(0), perf_counter_frequency_(0), - trace_enabled_(false), - inst_tracing_enabled_(false), - perf_counters_enabled_(false) { + value_(0) { memset(&trace_, 0, sizeof(trace_)); } @@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) { PostDeviceCreate(); } + static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid; + return result; } @@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; // it may be optional for Vulkan, but we provide it anyway if available). Pal::KernelContextInfo kernelContextInfo = {}; - Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo); // Ensure we've acquired the debug VMID (note that some platforms do not // implement this function, so don't fail the whole trace if so) *debug_vmid = kernelContextInfo.flags.hasDebugVmid; + assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!"); // Register the queue with the GPA session class for timed queue operation support. if (trace_.gpa_session_->RegisterTimedQueue( @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. 
+ { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } @@ -517,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { } } - // Notify the RGP server that we are starting a trace - if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { - result = Pal::Result::ErrorUnknown; + if (static_vm_id_) { + result = device_.iDev()->SetStaticVmidMode(true); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); } + if (result == Pal::Result::Success) { + // Notify the RGP server that we are starting a trace + if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { + result = Pal::Result::ErrorUnknown; + } + } // Tell the GPA session class we're starting a trace if (result == Pal::Result::Success) { GpuUtil::GpaSessionBeginInfo info = {}; @@ -707,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { return; } + auto disp_count = trace_.sqtt_disp_count_; // Finish the trace if the queue was destroyed before OCL reached // the number of captured dispatches if (trace_.sqtt_disp_count_ != 0) { @@ -736,9 +758,18 @@ } else { rgp_server_->EndTrace(); } + + if (static_vm_id_) { + auto result = device_.iDev()->SetStaticVmidMode(false); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); + } + if (trace_.gpa_session_ != nullptr) { trace_.gpa_session_->Reset(); } + // If application exits, then Windows kills all threads and + // RGP can't finish writing data to a file. 
+ amd::Os::sleep(10 * disp_count + 500); // Reset tracing state to idle trace_.prepared_disp_count_ = 0; trace_.sqtt_disp_count_ = 0; diff --git a/device/pal/palgpuopen.hpp b/device/pal/palgpuopen.hpp index ddae799b..33e9d35d 100644 --- a/device/pal/palgpuopen.hpp +++ b/device/pal/palgpuopen.hpp @@ -410,9 +410,10 @@ class RgpCaptureMgr { union { struct { - uint32_t trace_enabled_ : 1; // True if tracing is currently enabled (master flag) - uint32_t inst_tracing_enabled_; // Enable instruction-level SQTT tokens - uint32_t perf_counters_enabled_; // True if perf counters are enabled + uint32_t trace_enabled_: 1; // True if tracing is currently enabled (master flag) + uint32_t inst_tracing_enabled_: 1; // Enable instruction-level SQTT tokens + uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled + uint32_t static_vm_id_: 1; // Static VM ID can be used for capture }; uint32_t value_; }; diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index 19c6d9fe..035a6288 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) { - const_cast(palDevice()).HiddenHeapAlloc(); + const_cast(palDevice()).HiddenHeapAlloc(gpu); } if (palDevice().HeapBuffer() != nullptr) { // Add heap pointer to the code diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..080654c1 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -78,9 +78,10 @@ Settings::Settings() { // By default use host blit blitEngine_ = BlitEngineHost; - pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; + pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi; + size_t defaultMinXferSize = amd::IS_HIP ? 128: 4; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) - ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; + ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; // Disable FP_FAST_FMA defines by default reportFMAF_ = false; @@ -205,6 +206,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi31: // Fall through for Navi2x ... case Pal::AsicRevision::Phoenix1: + case Pal::AsicRevision::Phoenix2: case Pal::AsicRevision::Raphael: case Pal::AsicRevision::Rembrandt: case Pal::AsicRevision::Navi24: diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 64b5bfa6..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. 
So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; @@ -3741,49 +3741,7 @@ bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) { return false; } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - flushDMA(staging->getGpuEvent(*this)->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void* VirtualGPU::getOrCreateHostcallBuffer() { if (hostcallBuffer_ != nullptr) { return hostcallBuffer_; diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 1bbfe8e9..74fb039f 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -345,7 +345,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); virtual void submitVirtualMap(amd::VirtualMapCommand& cmd); virtual void submitStreamOperation(amd::StreamOperationCommand& cmd); void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd); @@ -360,6 +359,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void resetFenceDirty() {} + //! Returns GPU device object associated with this kernel const Device& dev() const { return gpuDevice_; } diff --git a/device/rocm/pro/lnxheaders.h b/device/rocm/pro/lnxheaders.h deleted file mode 100644 index 8c9fbe14..00000000 --- a/device/rocm/pro/lnxheaders.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }' -// blocks when building with a C++ compiler, so we need to add that ourselves. -#if __cplusplus -extern "C" -{ -#endif - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -constexpr int32_t InvalidFd = -1; // value representing a invalid file descriptor for Linux - -#if __cplusplus -} // extern "C" -#endif diff --git a/device/rocm/pro/prodevice.cpp b/device/rocm/pro/prodevice.cpp deleted file mode 100644 index 1e366cd2..00000000 --- a/device/rocm/pro/prodevice.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#ifndef WITHOUT_HSA_BACKEND - -#include "hsa/hsa_ext_amd.h" -#include "lnxheaders.h" -#include "prodevice.hpp" -#include "amdgpu_drm.h" - -namespace roc { - -constexpr uint32_t kMaxDevices = 32; -constexpr uint32_t kAtiVendorId = 0x1002; - -void* ProDevice::lib_drm_handle_ = nullptr; -bool ProDevice::initialized_ = false; -drm::Funcs ProDevice::funcs_; - -IProDevice* IProDevice::Init(uint32_t bus, uint32_t device, uint32_t func) -{ - // Make sure DRM lib is initialized - if (!ProDevice::DrmInit()) { - return nullptr; - } - - ProDevice* pro_device = new ProDevice(); - - if (pro_device == nullptr || !pro_device->Create(bus, dev, func)) { - delete pro_device; - return nullptr; - } - return pro_device; -} - -ProDevice::~ProDevice() { - delete alloc_ops_; - - if (dev_handle_ != nullptr) { - Funcs().AmdgpuDeviceDeinitialize(dev_handle_); - } - if (file_desc_ > 0) { - close(file_desc_); - } -} - -bool ProDevice::DrmInit() -{ - if (initialized_ == false) { - // Find symbols in libdrm_amdgpu.so.1 - lib_drm_handle_ = dlopen("libdrm_amdgpu.so.1", RTLD_NOW); - if (lib_drm_handle_ == nullptr) { - return false; - } else { - funcs_.DrmGetDevices = reinterpret_cast(dlsym( - lib_drm_handle_, - "drmGetDevices")); - if (funcs_.DrmGetDevices == nullptr) return false; - funcs_.AmdgpuDeviceInitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_initialize")); - if (funcs_.AmdgpuDeviceInitialize == nullptr) return false; - funcs_.AmdgpuDeviceDeinitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_deinitialize")); - if (funcs_.AmdgpuDeviceDeinitialize == nullptr) return false; - funcs_.AmdgpuQueryGpuInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_gpu_info")); - if (funcs_.AmdgpuQueryGpuInfo == nullptr) return false; - funcs_.AmdgpuQueryInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_info")); - if (funcs_.AmdgpuQueryInfo == nullptr) return false; - funcs_.AmdgpuBoAlloc = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_alloc")); - if (funcs_.AmdgpuBoAlloc == nullptr) return false; - funcs_.AmdgpuBoExport = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_export")); - if (funcs_.AmdgpuBoExport == nullptr) return false; - funcs_.AmdgpuBoFree = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_free")); - if (funcs_.AmdgpuBoFree == nullptr) return false; - funcs_.AmdgpuBoCpuMap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_map")); - if (funcs_.AmdgpuBoCpuMap == nullptr) return false; - funcs_.AmdgpuBoCpuUnmap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_unmap")); - if (funcs_.AmdgpuBoCpuUnmap == nullptr) return false; - } - } - - initialized_ = true; - return true; -} - -#ifndef AMDGPU_CAPABILITY_SSG_FLAG -#define AMDGPU_CAPABILITY_SSG_FLAG 4 -#endif - -// ================================================================================================ -// Open drm device and initialize it. And also get the drm information. 
-bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) { - drmDevicePtr devices[kMaxDevices] = { }; - int32_t device_count = Funcs().DrmGetDevices(devices, kMaxDevices); - bool result = false; - - for (int32_t i = 0; i < device_count; i++) { - // Check if the device vendor is AMD - if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) { - continue; - } - if ((devices[i]->businfo.pci->bus == bus) && - (devices[i]->businfo.pci->dev == device) && - (devices[i]->businfo.pci->func == func)) { - - // pDevices[i]->nodes[DRM_NODE_PRIMARY]; - // Using render node here so that we can do the off-screen rendering without authentication - file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0); - - if (file_desc_ > 0) { - void* data, *file, *cap; - - // Initialize the admgpu device. - if (Funcs().AmdgpuDeviceInitialize(file_desc_, &major_ver_, - &minor_ver_, &dev_handle_) == 0) { - uint32_t version = 0; - // amdgpu_query_gpu_info will never fail only if it is initialized - Funcs().AmdgpuQueryGpuInfo(dev_handle_, &gpu_info_); - - drm_amdgpu_capability cap = {}; - Funcs().AmdgpuQueryInfo(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap); - - // Check if DGMA and SSG are available - if ((cap.flag & (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) == - (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) { - result = true; - break; - } - } - } - } - } - - if (result) { - alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true); - if (nullptr == alloc_ops_) { - return true; - } - } - - return result; -} - -void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const -{ - amd::ScopedLock l(alloc_ops_); - void* ptr = nullptr; - amdgpu_bo_handle buf_handle = 0; - amdgpu_bo_alloc_request req = {0}; - *host_ptr = nullptr; - - req.alloc_size = size; - req.phys_alignment = 64 * Ki; - req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA; - - // Allocate buffer in DGMA heap - if (0 == Funcs().AmdgpuBoAlloc(dev_handle_, &req, &buf_handle)) { - amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; - uint32_t shared_handle = 0; - // Find the base driver handle - if (0 == Funcs().AmdgpuBoExport(buf_handle, type, &shared_handle)) { - uint32_t flags = 0; - size_t buf_size = 0; - // Map memory object to HSA device - if (0 == hsa_amd_interop_map_buffer(1, &agent, shared_handle, - flags, &buf_size, &ptr, nullptr, nullptr)) { - // Ask GPUPro driver to provide CPU access to allocation - if (0 == Funcs().AmdgpuBoCpuMap(buf_handle, host_ptr)) { - allocs_.insert({ptr, {buf_handle, shared_handle}}); - } - else { - hsa_amd_interop_unmap_buffer(ptr); - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - Funcs().AmdgpuBoFree(buf_handle); - } - } - - return ptr; -} - -void ProDevice::FreeDmaBuffer(void* ptr) const -{ - amd::ScopedLock l(alloc_ops_); - auto it = allocs_.find(ptr); - if (it != allocs_.end()) { - Funcs().AmdgpuBoCpuUnmap(it->second.first); - // Unmap memory from HSA device - hsa_amd_interop_unmap_buffer(ptr); - // Close shared handle - close(it->second.second); - int error = Funcs().AmdgpuBoFree(it->second.first); - allocs_.erase(it); - } -} - -} - -#endif // WITHOUT_HSA_BACKEND - diff --git a/device/rocm/pro/prodevice.hpp b/device/rocm/pro/prodevice.hpp deleted file mode 100644 index 80ff3600..00000000 --- a/device/rocm/pro/prodevice.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced 
Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "profuncs.hpp" -#include "prodriver.hpp" -#include "thread/monitor.hpp" -#include - -/*! \addtogroup HSA - * @{ - */ - -//! HSA Device Implementation -namespace roc { - -class ProDevice : public IProDevice { -public: - static bool DrmInit(); - - ProDevice() - : file_desc_(0) - , major_ver_(0) - , minor_ver_(0) - , dev_handle_(nullptr) - , alloc_ops_(nullptr) {} - virtual ~ProDevice() override; - - bool Create(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer( - hsa_agent_t agent, size_t size, void** host_ptr) const override; - virtual void FreeDmaBuffer(void* ptr) const override; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const override - { - *asic_id = gpu_info_.asic_id; - *rev_id = gpu_info_.pci_rev_id; - } - -private: - static void* lib_drm_handle_; - static bool initialized_; - static drm::Funcs funcs_; - const drm::Funcs& Funcs() const { return funcs_; } - - int32_t file_desc_; //!< File descriptor for the device - uint32_t major_ver_; //!< Major driver version - uint32_t minor_ver_; //!< Minor driver version - amdgpu_device_handle dev_handle_; //!< AMD gpu device handle - amdgpu_gpu_info gpu_info_; //!< GPU info structure - amdgpu_heap_info heap_info_; //!< Information about memory - mutable std::unordered_map> allocs_; //!< Alloced memory mapping - amd::Monitor* alloc_ops_; //!< Serializes memory allocations/destructions -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/prodriver.hpp b/device/rocm/pro/prodriver.hpp deleted file mode 100644 index 819ade27..00000000 --- a/device/rocm/pro/prodriver.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "hsa/hsa.h" - -/*! \addtogroup HSA - * @{ - */ - -namespace roc { - -//! Pro Device Interface -class IProDevice : public amd::HeapObject { -public: - static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0; - virtual void FreeDmaBuffer(void* ptr) const = 0; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const = 0; - - IProDevice() {} - virtual ~IProDevice() {} -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/profuncs.hpp b/device/rocm/pro/profuncs.hpp deleted file mode 100644 index e878df0c..00000000 --- a/device/rocm/pro/profuncs.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#pragma once - -namespace roc -{ -namespace drm -{ -typedef int (*DrmGetDevices)( - drmDevicePtr* pDevices, - int maxDevices); - -typedef int (*AmdgpuDeviceInitialize)( - int fd, - uint32_t* pMajorVersion, - uint32_t* pMinorVersion, - amdgpu_device_handle* pDeviceHandle); - -typedef int (*AmdgpuDeviceDeinitialize)( - amdgpu_device_handle hDevice); - -typedef int (*AmdgpuQueryGpuInfo)( - amdgpu_device_handle hDevice, - struct amdgpu_gpu_info* pInfo); - -typedef int (*AmdgpuQueryInfo)( - amdgpu_device_handle hDevice, - unsigned infoId, - unsigned size, - void* pValue); - -typedef int (*AmdgpuBoAlloc)( - amdgpu_device_handle hDevice, - struct amdgpu_bo_alloc_request* pAllocBuffer, - amdgpu_bo_handle* pBufferHandle); - -typedef int (*AmdgpuBoExport)( - amdgpu_bo_handle hBuffer, - enum amdgpu_bo_handle_type type, - uint32_t* pFd); - -typedef int (*AmdgpuBoFree)( - amdgpu_bo_handle hBuffer); - -typedef int (*AmdgpuBoCpuMap)( - amdgpu_bo_handle hBuffer, - void** ppCpuAddress); - -typedef int (*AmdgpuBoCpuUnmap)( - amdgpu_bo_handle hBuffer); - -struct Funcs -{ - DrmGetDevices DrmGetDevices; - AmdgpuDeviceInitialize AmdgpuDeviceInitialize; - AmdgpuDeviceDeinitialize AmdgpuDeviceDeinitialize; - AmdgpuQueryGpuInfo AmdgpuQueryGpuInfo; - AmdgpuQueryInfo AmdgpuQueryInfo; - AmdgpuBoAlloc AmdgpuBoAlloc; - AmdgpuBoExport AmdgpuBoExport; - AmdgpuBoFree AmdgpuBoFree; - AmdgpuBoCpuMap AmdgpuBoCpuMap; - AmdgpuBoCpuUnmap AmdgpuBoCpuUnmap; -}; - -} //namespace drm -} //namespace roc diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index f6b447ca..1bd6a0ff 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -457,7 +457,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy Rect wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active); @@ -478,7 +478,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( (reinterpret_cast
(dst) + dstOffset), dstAgent, @@ -668,9 +668,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Use SDMA to transfer the data ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy dst=0x%zx, src=0x%zx, size=%d, wait_event=0x%zx, " - "completion_signal=0x%zx", - dst, src, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); + "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, completion_signal=0x%zx", + dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active); @@ -2182,7 +2181,7 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern // ================================================================================================ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, bool entire, + const amd::Coord3D& sizeIn, bool entire, amd::CopyMetadata copyMetadata) const { amd::ScopedLock k(lockXferOps_); bool result = false; @@ -2682,35 +2681,20 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, Memory* schedulerMem = dev().getRocMemory(schedulerParam); sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); - - sp->hidden_global_offset_x = 0; - sp->hidden_global_offset_y = 0; - sp->hidden_global_offset_z = 0; sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); sp->complete_signal = schedulerSignal; hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - sp->scheduler_aql.setup = 1; - sp->scheduler_aql.workgroup_size_x = 1; - sp->scheduler_aql.workgroup_size_y = 1; - sp->scheduler_aql.workgroup_size_z = 1; - sp->scheduler_aql.grid_size_x = threads; - sp->scheduler_aql.grid_size_y = 1; - sp->scheduler_aql.grid_size_z = 1; - sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle(); - sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address; - sp->scheduler_aql.private_segment_size = 0; - sp->scheduler_aql.group_segment_size = 0; + sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); - sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + + if (dev().info().maxEngineClockFrequency_ > 0) { + sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + } // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); @@ -2721,7 +2705,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, address parameters = captureArguments(kernels_[Scheduler]); if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], - parameters, nullptr)) { + parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) { return false; } releaseArguments(parameters); diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 961227bb..287ece39 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -40,9 +40,6 @@ #include "device/rocm/rocmemory.hpp" #include 
"device/rocm/rocglinterop.hpp" #include "device/rocm/rocsignal.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif #include "platform/sampler.hpp" #if defined(__clang__) @@ -163,8 +160,6 @@ Device::Device(hsa_agent_t bkendDevice) , xferQueue_(nullptr) , xferRead_(nullptr) , xferWrite_(nullptr) - , pro_device_(nullptr) - , pro_ena_(false) , freeMem_(0) , vgpusAccess_("Virtual GPU List Ops Lock", true) , hsa_exclusive_gpu_access_(false) @@ -218,9 +213,6 @@ void Device::checkAtomicSupport() { } Device::~Device() { -#ifdef WITH_AMDGPU_PRO - delete pro_device_; -#endif // Release cached map targets for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { if ((*mapCache_)[i] != nullptr) { @@ -244,17 +236,17 @@ Device::~Device() { } for (auto& it : queuePool_) { - for (auto& qIter : it) { - hsa_queue_t* queue = qIter.first; - auto& qInfo = qIter.second; + for (auto qIter = it.begin(); qIter != it.end(); ) { + hsa_queue_t* queue = qIter->first; + auto& qInfo = qIter->second; if (qInfo.hostcallBuffer_) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, qIter.first); + qInfo.hostcallBuffer_, qIter->first); disableHostcalls(qInfo.hostcallBuffer_); context().svmFree(qInfo.hostcallBuffer_); } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - it.erase(queue); + qIter = it.erase(qIter); hsa_queue_destroy(queue); } } @@ -687,18 +679,6 @@ bool Device::create() { } info_.pciDomainID = pci_domain_id; -#ifdef WITH_AMDGPU_PRO - // Create amdgpu-pro device interface for SSG support - pro_device_ = IProDevice::Init( - info_.deviceTopology_.pcie.bus, - info_.deviceTopology_.pcie.device, - info_.deviceTopology_.pcie.function); - if (pro_device_ != nullptr) { - pro_ena_ = true; - pro_device_->GetAsicIdAndRevisionId(&info_.pcieDeviceId_, &info_.pcieRevisionId_); - } -#endif - // Get Agent HDP Flush Register Memory hsa_amd_hdp_flush_t hdpInfo; if (HSA_STATUS_SUCCESS != @@ -1169,7 +1149,9 @@ bool Device::populateOCLDeviceConstants() { //TODO: add the assert statement for Raven if (!(isa().versionMajor() == 9 && isa().versionMinor() == 0 && isa().versionStepping() == 2)) { - assert(info_.maxEngineClockFrequency_ > 0); + if (info_.maxEngineClockFrequency_ <= 0) { + LogError("maxEngineClockFrequency_ is NOT positive!"); + } } if (HSA_STATUS_SUCCESS != @@ -1642,7 +1624,7 @@ bool Device::populateOCLDeviceConstants() { LogError("HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS query failed."); } - LogPrintfInfo("HMM support: %d, xnack: %d, direct host access: %d\n", + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "HMM support: %d, xnack: %d, direct host access: %d\n", info_.hmmSupported_, info_.hmmCpuMemoryAccessible_, info_.hmmDirectHostAccess_); info_.globalCUMask_ = {}; @@ -2229,8 +2211,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me return false; } - // Pass the pointer and memory size to retrieve the handle - hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, amd::alignUp(*mem_size, alloc_granularity()), + hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, *mem_size, reinterpret_cast(handle)); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -2331,7 +2312,6 @@ bool Device::IpcDetach (void* dev_ptr) const { // ================================================================================================ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { - constexpr bool kForceAllocation 
= true; amd::Memory* mem = nullptr; if (nullptr == svmPtr) { @@ -2343,7 +2323,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (!mem->create(nullptr, false, false, kForceAllocation)) { + if (!mem->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); mem->release(); return nullptr; @@ -2730,10 +2710,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } +// ================================================================================================ +bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); + if (hw_event == nullptr) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + return false; + } + static constexpr bool Timeout = true; + return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); +} + // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait) const { - void* hw_event = (event.NotifyEvent() != nullptr) ? - event.NotifyEvent()->HwEvent() : event.HwEvent(); + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); return false; @@ -3210,7 +3202,9 @@ device::Signal* Device::createSignal() const { amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM // or if runtime received an interop from another ROCr's client - if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) { + hsa_amd_pointer_info_t ptr_info = {}; + ptr_info.size = sizeof(hsa_amd_pointer_info_t); + if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) { return nullptr; } @@ -3227,8 +3221,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size } // Calculate the offset of the pointer. 
- const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory( - *arena_mem_obj_->getContext().devices()[0])->virtualAddress()); + const void* dev_ptr = reinterpret_cast( + arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) + ->virtualAddress()); offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); return arena_mem_obj_; @@ -3242,20 +3237,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } // ================================================================================================ -bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { - hsa_amd_pointer_info_t ptr_info = {}; - ptr_info.size = sizeof(hsa_amd_pointer_info_t); +bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation - hsa_status_t status = hsa_amd_pointer_info( - const_cast(dev_ptr), &ptr_info, nullptr, nullptr, nullptr); + hsa_status_t status = + hsa_amd_pointer_info(const_cast(dev_ptr), ptr_info, nullptr, nullptr, nullptr); // The call should never fail in ROCR, but just check for an error and continue if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } - // Check if it's a legacy non-HMM allocation in ROCr - if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info.agentBaseAddress)) > size)) { + + // Return false for pinned memory. A true return may result in a race because + // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment + if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) { + return false; + } + + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { + if ((size != 0) && + ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } return true; @@ -3264,8 +3264,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAllocZeroOut = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAllocZeroOut = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3277,7 +3277,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index b3da3783..b36d21ef 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -75,7 +75,6 @@ class Memory; class Resource; class VirtualDevice; class PrintfDbg; -class IProDevice; class ProfilingSignal : public amd::ReferenceCountedObject { public: @@ -258,6 +257,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual void ReleaseGlobalSignal(void* signal) const 
{} @@ -443,6 +443,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; @@ -478,10 +479,6 @@ class Device : public NullDevice { //! Create internal blit program bool createBlitProgram(); - // Returns AMD GPU Pro interfacs - const IProDevice& iPro() const { return *pro_device_; } - bool ProEna() const { return pro_ena_; } - // P2P agents avaialble for this device const std::vector& p2pAgents() const { return p2p_agents_; } @@ -549,10 +546,10 @@ class Device : public NullDevice { const bool isFineGrainSupported() const; //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) - bool IsValidAllocation(const void* dev_ptr, size_t size) const; + bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info); //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: bool create(); @@ -598,8 +595,6 @@ class Device : public NullDevice { XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - const IProDevice* pro_device_; //!< AMDGPUPro device - bool pro_ena_; //!< Extra functionality with AMDGPUPro device, beyond ROCr std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index fbe6bd01..d267e0d0 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -37,9 +37,6 @@ #include "platform/sampler.hpp" #include "amdocl/cl_gl_amd.hpp" #include "amdocl/cl_vk_amd.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif namespace roc { @@ -646,8 +643,9 @@ void Buffer::destroy() { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { if (dev().info().hmmSupported_) { // AMD HMM path. 
Destroy system memory - amd::Os::uncommitMemory(deviceMemory_, size()); - amd::Os::releaseMemory(deviceMemory_, size()); + if (!(amd::Os::releaseMemory(deviceMemory_, size()))) { + ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n"); + } } else { dev().hostFree(deviceMemory_, size()); } @@ -673,12 +671,6 @@ void Buffer::destroy() { return; } -#ifdef WITH_AMDGPU_PRO - if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { - dev().iPro().FreeDmaBuffer(deviceMemory_); - return; - } -#endif if (deviceMemory_ != nullptr) { if (deviceMemory_ != owner()->getHostMem()) { // if they are identical, the host pointer will be @@ -755,7 +747,6 @@ bool Buffer::create(bool alloc_local) { if (deviceMemory_ == NULL) { return false; } - amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); // Currently HMM requires cirtain initial calls to mark sysmem allocation as // GPU accessible or prefetch memory into GPU if (!dev().SvmAllocInit(deviceMemory_, size())) { @@ -797,7 +788,7 @@ bool Buffer::create(bool alloc_local) { } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN) != 0); + (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/device/rocm/rocurilocator.hpp b/device/rocm/rocurilocator.hpp index 880b6c72..76451087 100644 --- a/device/rocm/rocurilocator.hpp +++ b/device/rocm/rocurilocator.hpp @@ -22,7 +22,7 @@ #if defined(__clang__) #if __has_feature(address_sanitizer) #include "device/devurilocator.hpp" -#include "hsa_ven_amd_loader.h" +#include "hsa/hsa_ven_amd_loader.h" #include namespace roc { diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 56b6022f..f2850b55 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -78,13 +78,8 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierPacketAgentScopeHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - static constexpr uint16_t kNopPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -99,14 +94,16 @@ static constexpr uint16_t kBarrierPacketReleaseHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr uint16_t kBarrierVendorPacketHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierVendorPacketAgentScopeHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - 
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kBarrierVendorPacketNopScopeHeader = + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = { kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}}; @@ -433,8 +430,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->ts_ = ts; ts->AddProfilingSignal(prof_signal); if (AMD_DIRECT_DISPATCH) { - bool enqueHandler= false; - uint32_t init_value = kInitSignalValueOne; + bool enqueHandler = false; enqueHandler = (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr ) && !ts->command().CpuWaitRequested(); @@ -990,6 +986,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { @@ -1002,7 +999,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, } // Reset fence_dirty_ flag if we submit a barrier - fence_dirty_ = false; + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_barrier_and_packet_t* aql_loc = @@ -1064,6 +1063,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD } } + fence_dirty_ = true; + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (completionSignal.handle == 0) { // Get active signal for current dispatch if profiling is necessary barrier_value_packet_.completion_signal = @@ -1073,6 +1076,11 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD barrier_value_packet_.completion_signal = completionSignal; } + // Reset fence_dirty_ flag if we submit a barrier + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast( @@ -1080,9 +1088,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD *aql_loc = barrier_value_packet_; packet_store_release(reinterpret_cast(aql_loc), packetHeader, rest); - auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, @@ -1817,7 +1822,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo realSize.c[0] *= elemSize; } - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire, copyMetadata); break; } @@ -2781,7 +2786,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // 
================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, - uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd, + hsa_kernel_dispatch_packet_t* aql_packet) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -2957,7 +2963,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) { - const_cast(dev()).HiddenHeapAlloc(); + const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { // Add heap pointer to the code @@ -3108,6 +3114,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_; } + // Copy scheduler's AQL packet for possible relaunch from the scheduler itself + if (aql_packet != nullptr) { + *aql_packet = dispatchPacket; + aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + } + // Dispatch the packet if (!dispatchAqlPacket( &dispatchPacket, aqlHeaderWithOrder, @@ -3242,11 +3258,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); int32_t releaseFlags = vcmd.getEventScope(); - if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { - dispatchBarrierValuePacket(kBarrierVendorPacketAgentScopeHeader, true); + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); } else { - dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + dispatchBarrierPacket(kNopPacketHeader, false); } } else { // Submit a barrier with a cache flushes. 
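
The barrier hunks above make the fence tracking scope-aware: dispatchBarrierPacket() and dispatchBarrierValuePacket() now mark the queue's fence as dirty for every barrier they build, and clear the flag again only when the packet releases at system cache scope. A minimal sketch of that pattern, with TrackedQueue and CacheScope as illustrative stand-ins rather than the real VirtualGPU and Device types:

  // Sketch only: mirrors the fence_dirty_ bookkeeping added to the barrier dispatch paths.
  enum class CacheScope { None, Agent, System };

  class TrackedQueue {
   public:
    void dispatchBarrier(CacheScope release_scope) {
      fence_dirty_ = true;                      // any barrier may leave caches unreleased
      if (release_scope == CacheScope::System) {
        fence_dirty_ = false;                   // a system-scope release makes writes visible
      }
      // ... build the AQL barrier packet and ring the doorbell here ...
    }

    bool isFenceDirty() const { return fence_dirty_; }
    void resetFenceDirty() { fence_dirty_ = false; }

   private:
    bool fence_dirty_ = false;                  // queried before deciding to flush again
  };

The isFenceDirty()/resetFenceDirty() pair corresponds to the accessor added to rocvirtual.hpp further below, which lets the platform layer decide whether a final system-scope barrier is still required.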
@@ -3353,55 +3369,10 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { return nullptr; } +// ================================================================================================ void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getRocMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 11a3670e..8c002658 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); // Active wait time out incase same sdma engine is used again, // then just wait instead of adding dependency wait signal. -constexpr static uint64_t kSDMAEngineTimeout = 10; +constexpr static uint64_t kForcedTimeout = 10; template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { @@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd } if (active_wait_timeout) { // If diff engine, wait to 10 ms. Otherwise no wait - timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; + timeout = (forced_wait ? 
kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; if (timeout == 0) { return false; } @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command + amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command + hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); @@ -341,8 +342,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {} void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} @@ -408,6 +407,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void resetFenceDirty() { fence_dirty_ = false; } // } roc OpenCL integration private: //! Dispatches a barrier with blocking HSA signals diff --git a/platform/command.cpp b/platform/command.cpp index cc35579a..35ec5113 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -436,13 +436,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; - setEventScope(amd::Device::kCacheStateSystem); } kernel_.retain(); } void NDRangeKernelCommand::releaseResources() { - kernel_.parameters().release(parameters_, queue()->device()); + kernel_.parameters().release(parameters_); DEBUG_ONLY(parameters_ = NULL); kernel_.release(); Command::releaseResources(); @@ -726,71 +725,6 @@ bool ThreadTraceMemObjectsCommand::validateMemory() { return true; } -void TransferBufferFileCommand::releaseResources() { - for (uint i = 0; i < NumStagingBuffers; ++i) { - if (NULL != staging_[i]) { - staging_[i]->release(); - } - } - - // Call the parent - OneMemoryArgCommand::releaseResources(); -} - -void TransferBufferFileCommand::submit(device::VirtualDevice& device) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD)) { - void* srcDstBuffer = nullptr; - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - srcDstBuffer = mem->cpuMap(device); - } else { - srcDstBuffer = mem->cpuMap(device); - } - // Make HD transfer to the host accessible memory - bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - mem->cpuUnmap(device); - } else { - mem->cpuUnmap(device); - } - } else { - device.submitTransferBufferFromFile(*this); - } -} - -bool TransferBufferFileCommand::validateMemory() { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate 
staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - DevLogPrintfError("Staging Create failed, Staging[%d]: 0x%x", i, staging_[i]); - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); - return false; - } - } - } - - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } - return true; -} - bool CopyMemoryP2PCommand::validateMemory() { amd::Device* queue_device = &queue()->device(); diff --git a/platform/command.hpp b/platform/command.hpp index 7022ec3d..45d9e365 100644 --- a/platform/command.hpp +++ b/platform/command.hpp @@ -1642,66 +1642,6 @@ class SvmUnmapMemoryCommand : public Command { void* svmPtr() const { return svmPtr_; } }; -/*! \brief A generic transfer memory from/to file command. - * - * \details Currently supports buffers only. Buffers - * are treated as 1D structures so origin_[0] and size_[0] - * are equivalent to offset_ and count_ respectively. - */ -class TransferBufferFileCommand : public OneMemoryArgCommand { - public: - static constexpr uint NumStagingBuffers = 2; - static constexpr size_t StagingBufferSize = 4 * Mi; - static constexpr uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; - - protected: - const Coord3D origin_; //!< Origin of the region to write to - const Coord3D size_; //!< Size of the region to write to - LiquidFlashFile* file_; //!< The file object for data read - size_t fileOffset_; //!< Offset in the file for data read - amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer - - public: - TransferBufferFileCommand(cl_command_type type, HostQueue& queue, - const EventWaitList& eventWaitList, Memory& memory, - const Coord3D& origin, const Coord3D& size, LiquidFlashFile* file, - size_t fileOffset) - : OneMemoryArgCommand(queue, type, eventWaitList, memory), - origin_(origin), - size_(size), - file_(file), - fileOffset_(fileOffset) { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = NULL; - } - } - - virtual void releaseResources(); - - virtual void submit(device::VirtualDevice& device); - - //! Return the memory object to write to - Memory& memory() const { return *memory_; } - - //! Return the host memory to read from - LiquidFlashFile* file() const { return file_; } - - //! Returns file offset - size_t fileOffset() const { return fileOffset_; } - - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } - - //! Return the staging buffer for transfer - Memory& staging(uint i) const { return *staging_[i]; } - - bool validateMemory(); -}; - /*! \brief A P2P copy memory command * * \details Used for buffers only. 
Backends are expected diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index e9e56cb8..ce8ba4ae 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -66,8 +66,8 @@ bool HostQueue::terminate() { marker->awaitCompletion(); marker->release(); } - thread_.acceptingCommands_ = false; thread_.Release(); + thread_.acceptingCommands_ = false; } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; @@ -120,7 +120,7 @@ void HostQueue::finish() { return; } } - if (nullptr == command || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending() || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index 863f8eca..f98332bf 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -162,13 +162,13 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); Release(); } else { diff --git a/platform/context.cpp b/platform/context.cpp index dbb46c38..965269e8 100644 --- a/platform/context.cpp +++ b/platform/context.cpp @@ -34,14 +34,6 @@ #include "CL/cl_dx9_media_sharing.h" #endif //_WIN32 -#ifndef WITH_LIQUID_FLASH -#if (!defined(BUILD_HSA_TARGET) && defined(WITH_HSA_DEVICE) && \ - defined(WITH_AMDGPU_PRO)) || defined(_WIN32) -#define WITH_LIQUID_FLASH 1 -#include "lf.h" -#endif -#endif - namespace amd { Context::Context(const std::vector& devices, const Info& info) @@ -91,10 +83,6 @@ Context::~Context() { delete[] properties_; delete glenv_; - -#if WITH_LIQUID_FLASH - lfTerminate(); -#endif } int Context::checkProperties(const cl_context_properties* properties, Context::Info* info) { @@ -315,10 +303,6 @@ int Context::create(const intptr_t* properties) { } } -#if WITH_LIQUID_FLASH - lfInit(); -#endif - return result; } diff --git a/platform/kernel.cpp b/platform/kernel.cpp index 80cb1a12..52d4936a 100644 --- a/platform/kernel.cpp +++ b/platform/kernel.cpp @@ -253,7 +253,7 @@ bool KernelParameters::boundToSvmPointer(const Device& device, const_address cap return svmBound[index]; } -void KernelParameters::release(address mem, const amd::Device& device) const { +void KernelParameters::release(address mem) const { if (mem == nullptr) { // nothing to do! return; diff --git a/platform/kernel.hpp b/platform/kernel.hpp index e46fc5a7..8cb3b7f7 100644 --- a/platform/kernel.hpp +++ b/platform/kernel.hpp @@ -215,7 +215,7 @@ class KernelParameters : protected HeapObject { //! Capture the state of the parameters and return the stack base pointer. address capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error); //! Release the captured state of the parameters. - void release(address parameters, const amd::Device& device) const; + void release(address parameters) const; //! Allocate memory for this instance as well as the required storage for // the values_, defined_, and rawPointer_ arrays. 
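
With the commandqueue.cpp hunk above, HostQueue::finish() now also falls back to submitting a blocking marker when the virtual device reports a dirty fence, not only when there is no trailing command or a handler is still pending. A condensed sketch of that predicate, assuming the project headers below; the helper itself is hypothetical:

  #include "device/device.hpp"      // device::VirtualDevice
  #include "platform/command.hpp"   // amd::Command

  // Hypothetical helper mirroring the condition finish() checks before it submits a
  // trailing marker whose system-scope release also clears the dirty fence.
  static bool needsTrailingMarker(const amd::Command* last_command,
                                  device::VirtualDevice& vdev) {
    return last_command == nullptr ||   // nothing left to wait on directly
           vdev.isHandlerPending() ||   // a completion handler still has to run
           vdev.isFenceDirty();         // caches were not yet released at system scope
  }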
diff --git a/platform/memory.hpp b/platform/memory.hpp index 34ea71f8..ba89e18c 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -41,7 +41,7 @@ #define ROCCLR_MEM_HSA_SIGNAL_MEMORY (1u << 30) #define ROCCLR_MEM_INTERNAL_MEMORY (1u << 29) #define CL_MEM_VA_RANGE_AMD (1u << 28) -#define ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN (1u << 27) +#define ROCCLR_MEM_HSA_UNCACHED (1u << 27) namespace device { class Memory; @@ -663,42 +663,6 @@ class SvmBuffer : AllStatic { static Monitor AllocatedLock_; }; -#ifndef CL_COMMAND_WRITE_SSG_FILE_AMD -#define CL_COMMAND_WRITE_SSG_FILE_AMD 2 -#endif -#ifndef CL_COMMAND_READ_SSG_FILE_AMD -#define CL_COMMAND_READ_SSG_FILE_AMD 1 -#endif -#ifndef cl_file_flags_amd -typedef uint32_t cl_file_flags_amd; -#endif - //! Liquid flash extension -class LiquidFlashFile : public RuntimeObject { - private: - std::wstring name_; - cl_file_flags_amd flags_; - void* handle_; - uint32_t blockSize_; - uint64_t fileSize_; - - public: - LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) - : name_(name), flags_(flags), handle_(NULL), blockSize_(0), fileSize_(0) {} - - ~LiquidFlashFile(); - - bool open(); - void close(); - - uint32_t blockSize() const { return blockSize_; }; - uint64_t fileSize() const { return fileSize_; }; - - bool transferBlock(bool read, void* dst, uint64_t bufferSize, uint64_t fileOffset, - uint64_t bufferOffset, uint64_t size) const; - - virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } -}; - class ArenaMemory: public Buffer { public: ArenaMemory(Context& context) diff --git a/platform/object.hpp b/platform/object.hpp index 9fca95ab..0f5ed636 100644 --- a/platform/object.hpp +++ b/platform/object.hpp @@ -41,9 +41,7 @@ #define AMD_CL_TYPES_DO(F) \ F(cl_counter_amd, Counter) \ F(cl_perfcounter_amd, PerfCounter) \ - F(cl_threadtrace_amd, ThreadTrace) \ - F(cl_file_amd, LiquidFlashFile) - + F(cl_threadtrace_amd, ThreadTrace) #define CL_TYPES_DO(F) \ KHR_CL_TYPES_DO(F) \ @@ -144,7 +142,6 @@ class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject ObjectTypeQueue = 8, ObjectTypeSampler = 9, ObjectTypeThreadTrace = 10, - ObjectTypeLiquidFlashFile = 11 }; virtual ObjectType objectType() const = 0; diff --git a/utils/debug.cpp b/utils/debug.cpp index fc607564..cdda2516 100644 --- a/utils/debug.cpp +++ b/utils/debug.cpp @@ -49,17 +49,6 @@ extern "C" void breakpoint(void) { } //! \endcond -// ================================================================================================ -void report_fatal(const char* file, int line, const char* message) { - // FIXME_lmoriche: Obfuscate the message string - #if (defined(DEBUG)) - fprintf(outFile, "%s:%d: %s\n", file, line, message); - #else - fprintf(outFile, "%s\n", message); - #endif - ::abort(); -} - // ================================================================================================ void report_warning(const char* message) { fprintf(outFile, "Warning: %s\n", message); } diff --git a/utils/debug.hpp b/utils/debug.hpp index dd3ffd6e..07cb3b61 100644 --- a/utils/debug.hpp +++ b/utils/debug.hpp @@ -66,9 +66,6 @@ extern FILE* outFile; extern "C" void breakpoint(); //! \endcond -//! \brief Report a Fatal exception message and abort. -extern void report_fatal(const char* file, int line, const char* message); - //! \brief Display a warning message. extern void report_warning(const char* message); @@ -94,10 +91,10 @@ extern void log_printf(LogLevel level, const char* file, int line, uint64_t *sta #endif // __INTEL_COMPILER //! 
 \brief Abort the program if the invariant \a cond is false.
-#define guarantee(cond, message) \
+#define guarantee(cond, format, ...) \
   if (!(cond)) { \
-    amd::report_fatal(__FILE__, __LINE__, XSTR(message) ); \
-    amd::breakpoint(); \
+    amd::log_printf(amd::LOG_NONE, __FILE__, __LINE__, format, ##__VA_ARGS__); \
+    ::abort(); \
   }
 
 #define fixme_guarantee(cond, ...) guarantee(cond, __VA_ARGS__)
diff --git a/utils/versions.hpp b/utils/versions.hpp
index 2a85df5e..c8c96219 100644
--- a/utils/versions.hpp
+++ b/utils/versions.hpp
@@ -28,7 +28,7 @@
 #endif // AMD_PLATFORM_NAME
 
 #ifndef AMD_PLATFORM_BUILD_NUMBER
-#define AMD_PLATFORM_BUILD_NUMBER 3558
+#define AMD_PLATFORM_BUILD_NUMBER 3570
 #endif // AMD_PLATFORM_BUILD_NUMBER
 
 #ifndef AMD_PLATFORM_REVISION_NUMBER
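
After the debug.hpp change above, guarantee() takes a printf-style format string plus arguments, routes the message through amd::log_printf(), and then calls ::abort(); fixme_guarantee() forwards to the same macro. A small usage sketch (the validateCopy() helper and its parameters are hypothetical):

  #include <cstddef>
  #include "utils/debug.hpp"

  // Hypothetical caller: logs a formatted message and aborts when the size check fails.
  static void validateCopy(size_t copy_size, size_t buffer_size) {
    guarantee(copy_size <= buffer_size,
              "Copy size %zu exceeds buffer size %zu \n", copy_size, buffer_size);
  }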