diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index c7791ad29..61aefcdec 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -67,6 +67,7 @@ struct ScratchInfo { void* queue_base; size_t size; size_t size_per_thread; + uint32_t lanes_per_wave; ptrdiff_t queue_process_offset; bool large; bool retry; diff --git a/src/core/inc/amd_gpu_pm4.h b/src/core/inc/amd_gpu_pm4.h index b188682ac..008e5efa5 100644 --- a/src/core/inc/amd_gpu_pm4.h +++ b/src/core/inc/amd_gpu_pm4.h @@ -72,6 +72,12 @@ # define PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA (1 << 29) #define PM4_ACQUIRE_MEM_DW2_COHER_SIZE(x) (((x) & 0xFFFFFFFF) << 0) #define PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(x) (((x) & 0xFF) << 0) +#define PM4_ACQUIRE_MEM_DW7_GCR_CNTL(x) (((x) & 0x7FFFF) << 0) +# define PM4_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x) (((x) & 0x3) << 0) +# define PM4_ACQUIRE_MEM_GCR_CNTL_GLK_INV (1 << 7) +# define PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV (1 << 8) +# define PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV (1 << 9) +# define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV (1 << 14) #define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8) # define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7 diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index 59ba7dc4a..089390ed6 100644 --- a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -127,6 +127,13 @@ static const unsigned int kCodeTrapHandler9[] = { .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 + .elseif .amdgcn.gfx_generation_number == 10 + .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 + .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 + .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 + .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 + .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 + .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 .else .error "unsupported target" .endif @@ -217,6 +224,14 @@ static const unsigned int kCodeTrapHandler9[] = { s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 .endif + .if .amdgcn.gfx_generation_number == 10 + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK + s_or_b32 ttmp2, ttmp2, ttmp3 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + .endif // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 @@ -296,6 +311,77 @@ static const unsigned int kCodeFill8[] = { 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, }; +static const unsigned int kCodeCopyAligned10[] = { + 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, + 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, + 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02, + 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006, + 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, + 0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03, + 0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05, + 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, + 0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, + 0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E, + 0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, + 0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05, + 0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02, + 0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E, + 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, + 0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, + 0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05, + 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010, + 0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05, + 0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, + 0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000, +}; + +static const unsigned int kCodeCopyMisaligned10[] = { + 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, + 0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, + 0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, + 0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, + 0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, + 0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, + 0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, + 0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, + 0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810, + 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810, + 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810, + 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810, + 0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008, + 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05, + 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000, + 0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70, + 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105, + 0xBF82FFEE, 0xBF810000, +}; + +static const unsigned int kCodeFill10[] = { + 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602, + 0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, + 0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03, + 0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402, + 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B, + 0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103, + 0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402, + 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, +}; + +static const unsigned int kCodeTrapHandler10[] = { + 0xB96EF803, 0x8770FF6E, 0x10000100, 0xBF06FF70, 0x00000100, 0xBEF003FF, + 0x20000000, 0xBF85000E, 0x8770FF6E, 0x00000800, 0xBEF003F4, 0xBF85000A, + 0x93EEFF6D, 0x00080010, 0xBF84002C, 0xBF06826E, 0xBEF003FF, 0x80000000, + 0xBF850003, 0x806C846C, 0x826D806D, 0xBF820025, 0xBEFE03FF, 0x80000000, + 0xBF90000A, 0xBF800007, 0xBF0C9F7E, 0xBF84FFFD, 0x876EFF7E, 0x000003FF, + 0x8F6E836E, 0xF4051BBD, 0xDC000000, 0xBF8CC07F, 0xF4051BB7, 0xFA0000C0, + 0xBF8CC07F, 0xBEF10380, 0xF6811C37, 0xFA000008, 0xBF8CC07F, 0x88707170, + 0xBF85000E, 0xF4051C37, 0xFA000010, 0xBF8CC07F, 0x87F07070, 0xBF840009, + 0xF4011BB7, 0xFA000018, 0xBF8CC07F, 0xF4411BB8, 0xFA000000, 0xBF8CC07F, + 0xBEFC0380, 0xBF800000, 0xBF900001, 0x8878FF78, 0x00002000, 0x906E8977, + 0x876FFF6E, 0x003F8000, 0x906E8677, 0x876EFF6E, 0x02000000, 0x886E6F6E, + 0xB9EEF807, 0x87FE7E7E, 0x87EA6A6A, 0xB9F8F802, 0xBE80226C, +}; + } // namespace amd #endif // header guard diff --git a/src/core/inc/registers.h b/src/core/inc/registers.h index 39d86aecf..211ff5f9d 100644 --- a/src/core/inc/registers.h +++ b/src/core/inc/registers.h @@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006, BUF_NUM_FORMAT_FLOAT = 0x00000007, } BUF_NUM_FORMAT; +typedef enum BUF_FORMAT { +BUF_FORMAT_32_UINT = 0x00000014, +} BUF_FORMAT; + typedef enum SQ_SEL_XYZW01 { SQ_SEL_0 = 0x00000000, SQ_SEL_1 = 0x00000001, @@ -201,4 +205,38 @@ SQ_SEL_W = 0x00000007, float f32All; }; + union SQ_BUF_RSRC_WORD3_GFX10 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 7; + unsigned int RESERVED1 : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESOURCE_LEVEL : 1; + unsigned int RESERVED2 : 3; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 3; + unsigned int RESOURCE_LEVEL : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 2; + unsigned int FORMAT : 7; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; #endif // header guard diff --git a/src/core/inc/sdma_registers.h b/src/core/inc/sdma_registers.h index 6e1a7fbf5..cf91cf002 100644 --- a/src/core/inc/sdma_registers.h +++ b/src/core/inc/sdma_registers.h @@ -295,7 +295,14 @@ typedef struct SDMA_PKT_FENCE_TAG { struct { unsigned int op : 8; unsigned int sub_op : 8; - unsigned int reserved_0 : 16; + unsigned int mtype : 3; + unsigned int gcc : 1; + unsigned int sys : 1; + unsigned int pad1 : 1; + unsigned int snp : 1; + unsigned int gpa : 1; + unsigned int l2_policy : 2; + unsigned int reserved_0 : 6; }; unsigned int DW_0_DATA; } HEADER_UNION; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index 3a8e68641..f2bdf85d3 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { } // Process only one queue error. - if (error_code == 1) { + if (error_code & 0x401) { // insufficient scratch, wave64 or wave32 // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present. auto& scratch = queue->queue_scratch_; @@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { uint32_t scratch_request = pkt.dispatch.private_segment_size; scratch.size_per_thread = scratch_request; + scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64; // Align whole waves to 1KB. - scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); + scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave); scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * - queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; + queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave; queue->agent_->AcquireQueueScratch(scratch); @@ -948,7 +949,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { rel_mem[4] = 0; rel_mem[5] = 0; rel_mem[6] = 0; - } else if (agent_->isa()->GetMajorVersion() == 9) { + } else if (agent_->isa()->GetMajorVersion() >= 9) { // Construct an AQL packet to jump to the PM4 IB. struct amd_aql_pm4_ib { uint16_t header; @@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() { SQ_BUF_RSRC_WORD0 srd0; SQ_BUF_RSRC_WORD1 srd1; SQ_BUF_RSRC_WORD2 srd2; - SQ_BUF_RSRC_WORD3 srd3; + uint32_t srd3_u32; uint32_t scratch_base_hi = 0; uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base); @@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() { srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); - srd3.bits.DST_SEL_X = SQ_SEL_X; - srd3.bits.DST_SEL_Y = SQ_SEL_Y; - srd3.bits.DST_SEL_Z = SQ_SEL_Z; - srd3.bits.DST_SEL_W = SQ_SEL_W; - srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; - srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; - srd3.bits.ELEMENT_SIZE = 1; // 4 - srd3.bits.INDEX_STRIDE = 3; // 64 - srd3.bits.ADD_TID_ENABLE = 1; - srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL); - srd3.bits.HASH_ENABLE = 0; - srd3.bits.HEAP = 0; - srd3.bits.MTYPE__CI__VI = 0; - srd3.bits.TYPE = SQ_RSRC_BUF; + if (agent_->isa()->GetMajorVersion() < 10) { + SQ_BUF_RSRC_WORD3 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL); + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESOURCE_LEVEL = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } // Update Queue's Scratch descriptor's property amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; - amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; + amd_queue_.scratch_resource_descriptor[3] = srd3_u32; // Populate flat scratch parameters in amd_queue_. amd_queue_.scratch_backing_memory_location = queue_scratch_.queue_process_offset; amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; - amd_queue_.scratch_workitem_byte_size = - uint32_t(queue_scratch_.size_per_thread); + + // For backwards compatibility this field records the per-lane scratch + // for a 64 lane wavefront. If scratch was allocated for 32 lane waves + // then the effective size for a 64 lane wave is halved. + amd_queue_.scratch_wave64_lane_byte_size = + uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64); // Set concurrent wavefront limits only when scratch is being used. COMPUTE_TMPRING_SIZE tmpring_size = {}; @@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() { // Scratch is allocated program COMPUTE_TMPRING_SIZE register // Scratch Size per Wave is specified in terms of kilobytes - uint32_t wave_size = agent_props.WaveFrontSize; - uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024); + uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave * + queue_scratch_.size_per_thread) + 1023) / 1024); tmpring_size.bits.WAVESIZE = wave_scratch; assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024)); diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index 2a189de89..188345e2d 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -142,7 +142,8 @@ hsa_status_t BlitSdma::Initial } // HDP flush supported on gfx900 and forward. - if (agent_->isa()->GetMajorVersion() > 8) { + // FIXME: Not working on gfx10, raises SRBM write protection interrupt. + if (agent_->isa()->GetMajorVersion() == 9) { hdp_flush_support_ = true; } @@ -623,6 +624,10 @@ void BlitSdma::BuildFenceComma packet_addr->HEADER_UNION.op = SDMA_OP_FENCE; + if (agent_->isa()->GetMajorVersion() >= 10) { + packet_addr->HEADER_UNION.mtype = 3; + } + packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(fence); packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(fence); diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 5b27486dd..08950dda9 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -173,6 +173,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_7; ASICShader compute_8; ASICShader compute_9; + ASICShader compute_10; }; std::map compiled_shaders = { @@ -181,24 +182,28 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, }}, {"CopyAligned", { {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, }}, {"CopyMisaligned", { {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, }}, {"Fill", { {kCodeFill7, sizeof(kCodeFill7), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -217,6 +222,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar case 9: asic_shader = &compiled_shader_it->second.compute_9; break; + case 10: + asic_shader = &compiled_shader_it->second.compute_10; + break; default: assert(false && "Precompiled shader unavailable for target"); } @@ -886,13 +894,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, // Allocate scratch memory ScratchInfo scratch; if (private_segment_size == UINT_MAX) { - private_segment_size = 0; + private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_; + } + + if (private_segment_size > 262128) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + scratch.lanes_per_wave = 64; + scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave); + if (scratch.size_per_thread > 262128) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } scratch.size_per_thread = private_segment_size; const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - scratch.size = - scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu; + scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu; scratch.queue_base = nullptr; scratch.queue_process_offset = 0; @@ -941,7 +958,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { ScopedAcquire lock(&scratch_lock_); // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue. size_t small_limit = scratch_pool_.size() >> 3; - size_t single_limit = small_limit >> 2; + // Lift limit for 2.10 release RCCL workaround. + size_t single_limit = 146800640; //small_limit >> 2; bool large = (scratch.size > single_limit) || (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); large = (isa_->GetMajorVersion() < 8) ? false : large; @@ -1186,29 +1204,40 @@ void GpuAgent::InvalidateCodeCaches() { // Microcode is handling code cache invalidation. return; } - } else if (isa_->GetMajorVersion() > 9) { + } else if (isa_->GetMajorVersion() > 10) { assert(false && "Code cache invalidation not implemented for this agent"); } // Invalidate caches which may hold lines of code object allocation. - constexpr uint32_t cache_inv_size_dw = 7; - uint32_t cache_inv[cache_inv_size_dw]; + uint32_t cache_inv[8] = {0}; + uint32_t cache_inv_size_dw; + + if (isa_->GetMajorVersion() < 10) { + cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL( + PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA | + PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA | + PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA | + PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA); + + cache_inv_size_dw = 7; + } else { + cache_inv[7] = PM4_ACQUIRE_MEM_DW7_GCR_CNTL( + PM4_ACQUIRE_MEM_GCR_CNTL_GLI_INV(1) | + PM4_ACQUIRE_MEM_GCR_CNTL_GLK_INV | + PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV | + PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV | + PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV); + + cache_inv_size_dw = 8; + } cache_inv[0] = PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, cache_inv_size_dw, - isa_->GetMajorVersion()); - cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL( - PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA | - PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA | - PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA | - PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA); + isa_->GetMajorVersion()); cache_inv[2] = PM4_ACQUIRE_MEM_DW2_COHER_SIZE(0xFFFFFFFF); cache_inv[3] = PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(0xFF); - cache_inv[4] = 0; - cache_inv[5] = 0; - cache_inv[6] = 0; // Submit the command to the utility queue and wait for it to complete. - queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv)); + queues_[QueueUtility]->ExecutePM4(cache_inv, cache_inv_size_dw * sizeof(uint32_t)); } lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index bbb599db4..bb8e75881 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -1887,6 +1887,12 @@ static std::string ConvertOldTargetNameToNew( NewName = "amdgcn-amd-amdhsa--gfx906"; else if (OldName == "AMD:AMDGPU:9:0:8") NewName = "amdgcn-amd-amdhsa--gfx908"; + else if (OldName == "AMD:AMDGPU:10:1:0") + NewName = "amdgcn-amd-amdhsa--gfx1010"; + else if (OldName == "AMD:AMDGPU:10:1:1") + NewName = "amdgcn-amd-amdhsa--gfx1011"; + else if (OldName == "AMD:AMDGPU:10:1:2") + NewName = "amdgcn-amd-amdhsa--gfx1012"; else assert(false && "Unhandled target"); diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 77631a8fe..2fa7d2872 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -218,6 +218,9 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { ISAREG_ENTRY_GEN(9, 0, 6, false, true ) ISAREG_ENTRY_GEN(9, 0, 8, false, true ) ISAREG_ENTRY_GEN(9, 0, 8, false, false) + ISAREG_ENTRY_GEN(10, 1, 0, false, false) + ISAREG_ENTRY_GEN(10, 1, 1, false, false) + ISAREG_ENTRY_GEN(10, 1, 2, false, false) return supported_isas; } diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index edee52605..4d9dba5b8 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -68,6 +68,9 @@ #define EF_AMDGPU_MACH_AMDGCN_GFX906_LC 0x02f #define EF_AMDGPU_MACH_AMDGCN_GFX908_LC 0x030 #define EF_AMDGPU_MACH_AMDGCN_GFX909_LC 0x031 +#define EF_AMDGPU_MACH_AMDGCN_GFX1010_LC 0x033 +#define EF_AMDGPU_MACH_AMDGCN_GFX1011_LC 0x034 +#define EF_AMDGPU_MACH_AMDGCN_GFX1012_LC 0x035 #define EF_AMDGPU_XNACK_LC 0x100 #define EF_AMDGPU_SRAM_ECC_LC 0x200 diff --git a/src/inc/amd_hsa_queue.h b/src/inc/amd_hsa_queue.h index 2176e8470..2da98964d 100644 --- a/src/inc/amd_hsa_queue.h +++ b/src/inc/amd_hsa_queue.h @@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s { uint32_t scratch_resource_descriptor[4]; uint64_t scratch_backing_memory_location; uint64_t scratch_backing_memory_byte_size; - uint32_t scratch_workitem_byte_size; + uint32_t scratch_wave64_lane_byte_size; amd_queue_properties32_t queue_properties; uint32_t reserved3[2]; hsa_signal_t queue_inactive_signal; diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index 8bec5b2d5..7a133ff9c 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -579,6 +579,12 @@ namespace code { NewName = "amdgcn-amd-amdhsa--gfx906"; else if (OldName == "AMD:AMDGPU:9:0:8") NewName = "amdgcn-amd-amdhsa--gfx908"; + else if (OldName == "AMD:AMDGPU:10:1:0") + NewName = "amdgcn-amd-amdhsa--gfx1010"; + else if (OldName == "AMD:AMDGPU:10:1:1") + NewName = "amdgcn-amd-amdhsa--gfx1011"; + else if (OldName == "AMD:AMDGPU:10:1:2") + NewName = "amdgcn-amd-amdhsa--gfx1012"; else assert(false && "Unhandled target"); @@ -631,6 +637,9 @@ namespace code { case EF_AMDGPU_MACH_AMDGCN_GFX904_LC: isaName += "gfx904"; break; case EF_AMDGPU_MACH_AMDGCN_GFX906_LC: isaName += "gfx906"; break; case EF_AMDGPU_MACH_AMDGCN_GFX908_LC: isaName += "gfx908"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1010_LC: isaName += "gfx1010"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1011_LC: isaName += "gfx1011"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1012_LC: isaName += "gfx1012"; break; default: return false; } @@ -1348,6 +1357,8 @@ namespace code { asic = "VI"; } else if (major_version == 9) { asic = "GFX9"; + } else if (major_version == 10) { + asic = "GFX10"; } else { assert(!"unknown compute capability"); } diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp index 439d8930d..5c72664d5 100644 --- a/src/loader/loaders.cpp +++ b/src/loader/loaders.cpp @@ -92,6 +92,9 @@ namespace loader { gfx902.handle = 902; gfx903.handle = 903; gfx908.handle = 908; + gfx1010.handle = 1010; + gfx1011.handle = 1011; + gfx1012.handle = 1012; } hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) @@ -123,6 +126,12 @@ namespace loader { return gfx903; } else if (sname == "AMD:AMDGPU:9:0:8") { return gfx908; + } else if (sname == "AMD:AMDGPU:10:1:0") { + return gfx1010; + } else if (sname == "AMD:AMDGPU:10:1:1") { + return gfx1011; + } else if (sname == "AMD:AMDGPU:10:1:2") { + return gfx1012; } assert(0); diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp index dc8addd23..a16eac078 100644 --- a/src/loader/loaders.hpp +++ b/src/loader/loaders.hpp @@ -56,6 +56,7 @@ namespace loader { hsa_isa_t invalid; hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810; hsa_isa_t gfx900, gfx901, gfx902, gfx903, gfx908; + hsa_isa_t gfx1010, gfx1011, gfx1012; std::ostream& out; typedef std::set PointerSet; PointerSet pointers;