Skip to content

Commit b507d85

Browse files
authored
Merge pull request #76 from RadeonOpenCompute/roc-2.9.x
ROCm 2.9.0 updates
2 parents f446e05 + 8336f95 commit b507d85

File tree

8 files changed

+96
-135
lines changed

8 files changed

+96
-135
lines changed

src/core/inc/amd_gpu_agent.h

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,7 @@ class GpuAgentInt : public core::Agent {
129129
//
130130
// @param [in] signal Pointer to signal that provides the async copy timing.
131131
// @param [out] time Structure to be populated with the host domain value.
132-
virtual void TranslateTime(core::Signal* signal,
133-
hsa_amd_profiling_async_copy_time_t& time) {
134-
return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
135-
}
132+
virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;
136133

137134
// @brief Translate timestamp agent domain to host domain.
138135
//
@@ -248,9 +245,6 @@ class GpuAgent : public GpuAgentInt {
248245
// @brief Override from core::Agent.
249246
hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
250247

251-
// @brief Get the next available end timestamp object.
252-
uint64_t* ObtainEndTsObject();
253-
254248
// @brief Override from core::Agent.
255249
hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
256250

@@ -284,6 +278,9 @@ class GpuAgent : public GpuAgentInt {
284278
void TranslateTime(core::Signal* signal,
285279
hsa_amd_profiling_dispatch_time_t& time) override;
286280

281+
// @brief Override from amd::GpuAgentInt.
282+
void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;
283+
287284
// @brief Override from amd::GpuAgentInt.
288285
uint64_t TranslateTime(uint64_t tick) override;
289286

@@ -490,9 +487,6 @@ class GpuAgent : public GpuAgentInt {
490487
// @brief Create internal queues and blits.
491488
void InitDma();
492489

493-
// @brief Initialize memory pool for end timestamp object.
494-
// @retval True if the memory pool for end timestamp object is initialized.
495-
bool InitEndTsPool();
496490

497491
// Bind index of peer device that is connected via xGMI links
498492
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
@@ -503,23 +497,12 @@ class GpuAgent : public GpuAgentInt {
503497

504498
// Bind the Blit object that will drive the copy operation
505499
lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent);
506-
507500
// @brief Alternative aperture base address. Only on KV.
508501
uintptr_t ape1_base_;
509502

510503
// @brief Alternative aperture size. Only on KV.
511504
size_t ape1_size_;
512505

513-
// Each end ts is 32 bytes.
514-
static const size_t kTsSize = 32;
515-
516-
// Number of element in the pool.
517-
uint32_t end_ts_pool_size_;
518-
519-
std::atomic<uint32_t> end_ts_pool_counter_;
520-
521-
std::atomic<uint64_t*> end_ts_base_addr_;
522-
523506
DISALLOW_COPY_AND_ASSIGN(GpuAgent);
524507
};
525508

src/core/inc/signal.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,12 @@ class Signal;
8282
/// @brief ABI and object conversion struct for signals. May be shared between processes.
8383
struct SharedSignal {
8484
amd_signal_t amd_signal;
85+
uint64_t sdma_start_ts;
8586
Signal* core_signal;
8687
Check<0x71FCCA6A3D5D5276, true> id;
88+
uint8_t reserved[8];
89+
uint64_t sdma_end_ts;
90+
uint8_t reserved2[24];
8791

8892
SharedSignal() {
8993
memset(&amd_signal, 0, sizeof(amd_signal));
@@ -95,6 +99,39 @@ struct SharedSignal {
9599

96100
bool IsIPC() const { return core_signal == nullptr; }
97101

102+
/// @brief Return the addresses of the SDMA start and end time stamp slots of this signal.
/// @param [out] start Set to the address of sdma_start_ts.
/// @param [out] end Set to the address of sdma_end_ts.
void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
103+
/*
104+
SDMA timestamps on gfx7xx/gfx8xx require 32 byte alignment (gfx9xx relaxes
105+
alignment to 8 bytes). This conflicts with the frozen format for amd_signal_t
106+
so we place the time stamps in sdma_start/end_ts instead (amd_signal.start_ts
107+
is also properly aligned). Reading of the timestamps occurs in GetRawTs().
108+
*/
109+
start = &sdma_start_ts;
110+
end = &sdma_end_ts;
111+
}
112+
113+
/// @brief Reset the SDMA time stamp slots ahead of a copy operation.
void CopyPrep() {
114+
// Clear both SDMA time stamps before a copy. A non-zero sdma_end_ts afterwards
115+
// is how GetRawTs() detects that the copy was done via SDMA rather than a blit kernel.
116+
sdma_start_ts = 0;
117+
sdma_end_ts = 0;
118+
}
119+
120+
/// @brief Fetch the raw start/end time stamps recorded for this signal.
/// @param [in] FetchCopyTs Set to true when reading time stamps from a copy operation.
/// @param [out] start Receives sdma_start_ts or amd_signal.start_ts.
/// @param [out] end Receives sdma_end_ts or amd_signal.end_ts.
void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
121+
/*
122+
If the read is for a copy we need to check if it was done by blit kernel or SDMA.
123+
Since CopyPrep() clears sdma_start/end_ts, a non-zero sdma_end_ts is taken to mean
124+
the copy was done by SDMA. Otherwise return compute kernel stamps from amd_signal.
125+
*/
126+
if (FetchCopyTs && sdma_end_ts != 0) {
127+
start = sdma_start_ts;
128+
end = sdma_end_ts;
129+
return;
130+
}
131+
start = amd_signal.start_ts;
132+
end = amd_signal.end_ts;
133+
}
134+
98135
static __forceinline SharedSignal* Convert(hsa_signal_t signal) {
99136
SharedSignal* ret = reinterpret_cast<SharedSignal*>(static_cast<uintptr_t>(signal.handle) -
100137
offsetof(SharedSignal, amd_signal));
@@ -112,6 +149,12 @@ static_assert(std::is_standard_layout<SharedSignal>::value,
112149
"SharedSignal must remain standard layout for IPC use.");
113150
static_assert(std::is_trivially_destructible<SharedSignal>::value,
114151
"SharedSignal must not be modified on delete for IPC use.");
152+
static_assert((offsetof(SharedSignal, sdma_start_ts) % 32) == 0,
153+
"Bad SDMA time stamp alignment.");
154+
static_assert((offsetof(SharedSignal, sdma_end_ts) % 32) == 0,
155+
"Bad SDMA time stamp alignment.");
156+
static_assert(sizeof(SharedSignal) == 128,
157+
"Bad SharedSignal size.");
115158

116159
/// @brief Pool class for SharedSignal suitable for use with Shared.
117160
class SharedSignalPool_t : private BaseShared {
@@ -318,12 +361,23 @@ class Signal {
318361
/// @brief Checks if signal is currently in use by a wait API.
319362
bool InWaiting() const { return waiting_ != 0; }
320363

364+
// Prep for copy profiling: record the agent performing the copy and call
// CopyPrep() on this signal's SharedSignal, which zeroes its SDMA time stamps.
321365
__forceinline void async_copy_agent(core::Agent* agent) {
322366
async_copy_agent_ = agent;
367+
core::SharedSignal::Convert(Convert(this))->CopyPrep();
323368
}
324369

325370
__forceinline core::Agent* async_copy_agent() { return async_copy_agent_; }
326371

372+
// Forwards to SharedSignal::GetSdmaTsAddresses on this signal's SharedSignal;
// start/end receive the addresses of its SDMA time stamp slots.
void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
373+
core::SharedSignal::Convert(Convert(this))->GetSdmaTsAddresses(start, end);
374+
}
375+
376+
// Reads the raw start/end time stamps by forwarding to SharedSignal::GetRawTs.
// Set FetchCopyTs = true when reading time stamps from a copy operation.
377+
void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
378+
core::SharedSignal::Convert(Convert(this))->GetRawTs(FetchCopyTs, start, end);
379+
}
380+
327381
/// @brief Structure which defines key signal elements like type and value.
328382
/// Address of this struct is used as a value for the opaque handle of type
329383
/// hsa_signal_t provided to the public API.

src/core/runtime/amd_blit_kernel.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -625,9 +625,8 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
625625
// Insert barrier packets to handle dependent signals.
626626
// Barrier bit keeps signal checking traffic from competing with a copy.
627627
const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
628-
(1 << HSA_PACKET_HEADER_BARRIER) |
629628
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
630-
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
629+
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
631630

632631
hsa_barrier_and_packet_t barrier_packet = {0};
633632
barrier_packet.header = HSA_PACKET_TYPE_INVALID;
@@ -807,7 +806,6 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
807806

808807
static const uint16_t kDispatchPacketHeader =
809808
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
810-
(((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
811809
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
812810
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
813811

src/core/runtime/amd_blit_sdma.cpp

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
141141
platform_atomic_support_ = link.info.atomic_support_64bit;
142142
}
143143

144-
// Determine if sDMA microcode supports HDP flush command
145-
if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
144+
// HDP flush is supported on gfx900 and newer (ISA major version > 8).
145+
if (agent_->isa()->GetMajorVersion() > 8) {
146146
hdp_flush_support_ = true;
147147
}
148148

@@ -248,22 +248,13 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
248248
// profiling in the middle of the call.
249249
const bool profiling_enabled = agent_->profiling_enabled();
250250

251-
uint64_t* end_ts_addr = NULL;
251+
uint64_t* start_ts_addr = nullptr;
252+
uint64_t* end_ts_addr = nullptr;
252253
uint32_t total_timestamp_command_size = 0;
253254

254255
if (profiling_enabled) {
255-
// SDMA timestamp packet requires 32 byte of aligned memory, but
256-
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
257-
// read from a 32 byte aligned bounce buffer is required to avoid changing
258-
// the amd_signal_t ABI.
259-
260-
end_ts_addr = agent_->ObtainEndTsObject();
261-
if (end_ts_addr == NULL) {
262-
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
263-
}
264-
265-
total_timestamp_command_size =
266-
(2 * timestamp_command_size_) + linear_copy_command_size_;
256+
out_signal.GetSdmaTsAddresses(start_ts_addr, end_ts_addr);
257+
total_timestamp_command_size = 2 * timestamp_command_size_;
267258
}
268259

269260
// On agent that does not support platform atomic, we replace it with
@@ -315,8 +306,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
315306
}
316307

317308
if (profiling_enabled) {
318-
BuildGetGlobalTimestampCommand(
319-
command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
309+
BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(start_ts_addr));
320310
command_addr += timestamp_command_size_;
321311
}
322312

@@ -337,11 +327,6 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
337327
BuildGetGlobalTimestampCommand(command_addr,
338328
reinterpret_cast<void*>(end_ts_addr));
339329
command_addr += timestamp_command_size_;
340-
341-
BuildCopyCommand(command_addr, 1,
342-
reinterpret_cast<void*>(&out_signal.signal_.end_ts),
343-
reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
344-
command_addr += linear_copy_command_size_;
345330
}
346331

347332
// After transfer is completed, decrement the signal value.

src/core/runtime/amd_gpu_agent.cpp

Lines changed: 21 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
8686
memory_bus_width_(0),
8787
memory_max_frequency_(0),
8888
ape1_base_(0),
89-
ape1_size_(0),
90-
end_ts_pool_size_(0),
91-
end_ts_pool_counter_(0),
92-
end_ts_base_addr_(NULL) {
89+
ape1_size_(0) {
9390
const bool is_apu_node = (properties_.NumCPUCores > 0);
9491
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
9592

@@ -144,10 +141,6 @@ GpuAgent::~GpuAgent() {
144141
}
145142
}
146143

147-
if (end_ts_base_addr_ != NULL) {
148-
core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
149-
}
150-
151144
if (ape1_base_ != 0) {
152145
_aligned_free(reinterpret_cast<void*>(ape1_base_));
153146
}
@@ -405,58 +398,6 @@ void GpuAgent::InitCacheList() {
405398
cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
406399
}
407400

408-
bool GpuAgent::InitEndTsPool() {
409-
if (HSA_PROFILE_FULL == profile_) {
410-
return true;
411-
}
412-
413-
if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
414-
return true;
415-
}
416-
417-
ScopedAcquire<KernelMutex> lock(&blit_lock_);
418-
419-
if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
420-
return true;
421-
}
422-
423-
end_ts_pool_size_ =
424-
static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
425-
(BlitSdmaBase::kCopyPacketSize));
426-
427-
// Allocate end timestamp object for both h2d and d2h DMA.
428-
const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
429-
430-
core::Runtime* runtime = core::Runtime::runtime_singleton_;
431-
432-
uint64_t* buff = NULL;
433-
if (HSA_STATUS_SUCCESS !=
434-
runtime->AllocateMemory(local_region_, alloc_size,
435-
MemoryRegion::AllocateRestrict,
436-
reinterpret_cast<void**>(&buff))) {
437-
return false;
438-
}
439-
440-
end_ts_base_addr_.store(buff, std::memory_order_release);
441-
442-
return true;
443-
}
444-
445-
uint64_t* GpuAgent::ObtainEndTsObject() {
446-
if (end_ts_base_addr_ == NULL) {
447-
return NULL;
448-
}
449-
450-
const uint32_t end_ts_index =
451-
end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
452-
end_ts_pool_size_;
453-
const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
454-
uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
455-
assert(IsMultipleOf(end_ts_addr, kTsSize));
456-
457-
return end_ts_addr;
458-
}
459-
460401
hsa_status_t GpuAgent::IterateRegion(
461402
hsa_status_t (*callback)(hsa_region_t region, void* data),
462403
void* data) const {
@@ -701,10 +642,6 @@ hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
701642
}
702643

703644
hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
704-
if (enable && !InitEndTsPool()) {
705-
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
706-
}
707-
708645
for (auto& blit : blits_) {
709646
if (blit.created()) {
710647
const hsa_status_t stat = blit->EnableProfiling(enable);
@@ -1099,16 +1036,28 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
10991036

11001037
void GpuAgent::TranslateTime(core::Signal* signal,
11011038
hsa_amd_profiling_dispatch_time_t& time) {
1039+
uint64_t start, end;
1040+
signal->GetRawTs(false, start, end);
11021041
// Order is important, we want to translate the end time first to ensure that packet duration is
11031042
// not impacted by clock measurement latency jitter.
1104-
time.end = TranslateTime(signal->signal_.end_ts);
1105-
time.start = TranslateTime(signal->signal_.start_ts);
1106-
1107-
if ((signal->signal_.start_ts == 0) || (signal->signal_.end_ts == 0) ||
1108-
(signal->signal_.start_ts > t1_.GPUClockCounter) ||
1109-
(signal->signal_.end_ts > t1_.GPUClockCounter) ||
1110-
(signal->signal_.start_ts < t0_.GPUClockCounter) ||
1111-
(signal->signal_.end_ts < t0_.GPUClockCounter))
1043+
time.end = TranslateTime(end);
1044+
time.start = TranslateTime(start);
1045+
1046+
if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
1047+
(start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
1048+
debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
1049+
}
1050+
1051+
// Populates `time` with the translated start/end time stamps of an async copy.
// The raw stamps are read with FetchCopyTs = true (see Signal::GetRawTs).
void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) {
1052+
uint64_t start, end;
1053+
signal->GetRawTs(true, start, end);
1054+
// Order is important, we want to translate the end time first to ensure that packet duration is
1055+
// not impacted by clock measurement latency jitter.
1056+
time.end = TranslateTime(end);
1057+
time.start = TranslateTime(start);
1058+
1059+
// Diagnostic only: flag raw stamps that are zero or fall outside the
// [t0_.GPUClockCounter, t1_.GPUClockCounter] range. `time` is still populated.
if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
1060+
(start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
11121061
debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
11131062
}
11141063

@@ -1215,11 +1164,6 @@ void GpuAgent::BindTrapHandler() {
12151164
return;
12161165
}
12171166

1218-
// Disable trap handler on APUs until KFD is fixed.
1219-
if (profile_ == HSA_PROFILE_FULL) {
1220-
return;
1221-
}
1222-
12231167
// Assemble the trap handler source code.
12241168
AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_);
12251169

src/core/runtime/default_signal.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
8989
timer::fast_clock::time_point start_time, time;
9090
start_time = timer::fast_clock::now();
9191

92+
// Busy-poll budget: spin without sleeping until this much time has elapsed,
93+
// then back off with os::uSleep. Should be a few times bigger than null kernel latency.
94+
const timer::fast_clock::duration kMaxElapsed = std::chrono::microseconds(200);
95+
9296
uint64_t hsa_freq;
9397
HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq);
9498
const timer::fast_clock::duration fast_timeout =
@@ -127,7 +131,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
127131
value = atomic::Load(&signal_.value, std::memory_order_relaxed);
128132
return hsa_signal_value_t(value);
129133
}
130-
os::uSleep(20);
134+
if (time - start_time > kMaxElapsed) {
135+
os::uSleep(20);
136+
}
131137
}
132138
}
133139

0 commit comments

Comments
 (0)