diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f012e808f..dee403688 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -97,7 +97,7 @@ if (ROCM_CCACHE_BUILD) endif() # if (ROCM_CCACHE_BUILD) ## Get version strings -get_version ( "1.2.0" ) +get_version ( "1.3.0" ) if ( ${ROCM_PATCH_VERSION} ) set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 11b1e4957..e48f24c54 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -1145,6 +1145,12 @@ hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, return amdExtTable->hsa_amd_deregister_deallocation_callback_fn(ptr, callback); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr) { + return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr); +} + // Tools only table interfaces. 
namespace rocr { diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index d33862afd..68cf52d74 100644 --- a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -116,7 +116,8 @@ static const unsigned int kCodeTrapHandler9[] = { .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) .set SQ_WAVE_PC_HI_HT_MASK , 0x1000000 - .set SQ_WAVE_STATUS_HALT_MASK , 0x2000 + .set SQ_WAVE_STATUS_HALT_BIT , 13 + .set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_BIT | (1 << 16)) .set SQ_WAVE_TRAPSTS_ADDRESS_WATCH_MASK , 0x7080 .set SQ_WAVE_TRAPSTS_MEM_VIOL_MASK , 0x100 .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK , 0x800 @@ -128,11 +129,16 @@ static const unsigned int kCodeTrapHandler9[] = { .set MAX_NUM_DOORBELLS_MASK , ((1 << 10) - 1) .set SENDMSG_M0_DOORBELL_ID_BITS , 12 .set SENDMSG_M0_DOORBELL_ID_MASK , ((1 << SENDMSG_M0_DOORBELL_ID_BITS) - 1) - .set TTMP11_TRAP_RAISED_BIT , 7 - .set TTMP11_EXCP_RAISED_BIT , 8 - .set TTMP11_EVENTS_MASK , (1 << TTMP11_TRAP_RAISED_BIT) | (1 << TTMP11_EXCP_RAISED_BIT) - .set INSN_S_ENDPGM_OPCODE , 0xBF810000 - .set INSN_S_ENDPGM_MASK , 0xFFFF0000 + + .set TTMP7_DISPATCH_ID_CONVERTED_BIT , 31 + .set TTMP7_WAVE_STOPPED_BIT , 30 + .set TTMP7_SAVED_STATUS_HALT_BIT , 29 + .set TTMP7_SAVED_TRAP_ID_SHIFT , 25 + .set TTMP7_SAVED_TRAP_ID_BITS , 4 + .set TTMP7_SAVED_TRAP_ID_MASK , ((1 << TTMP7_SAVED_TRAP_ID_BITS) - 1) + .set TTMP7_PACKET_INDEX_BITS , 25 + .set TTMP7_PACKET_INDEX_MASK , ((1 << TTMP7_PACKET_INDEX_BITS) - 1) + .set TTMP11_PC_HI_SHIFT , 7 .if .amdgcn.gfx_generation_number == 9 .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 23 @@ -158,9 +164,9 @@ static const unsigned int kCodeTrapHandler9[] = { // ttmp14 = TMA[31:0] // ttmp15 = TMA[63:32] // gfx9: - // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[16:0], TrapRaised[0], ExcpRaised[0], NoScratch[0], WaveIdInWG[5:0] + // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[18:0], NoScratch[0], WaveIdInWG[5:0] // 
gfx10: - // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[14:0], TrapRaised[0], ExcpRaised[0], NoScratch[0], WaveIdInWG[5:0] + // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[16:0], NoScratch[0], WaveIdInWG[5:0] .macro mGetDoorbellId s_mov_b32 exec_lo, 0x80000000 @@ -197,6 +203,13 @@ static const unsigned int kCodeTrapHandler9[] = { .endm trap_entry: + s_andn2_b32 ttmp7, ttmp7, (TTMP7_SAVED_TRAP_ID_MASK << TTMP7_SAVED_TRAP_ID_SHIFT) | (1 << TTMP7_SAVED_STATUS_HALT_BIT) + + // Save the entry status.halt in ttmp7.saved_status_halt + s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE + s_lshl_b32 ttmp2, ttmp2, TTMP7_SAVED_STATUS_HALT_BIT + s_or_b32 ttmp7, ttmp7, ttmp2 + // If trap raised (non-zero trap id) then branch. s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE s_cbranch_scc1 .trap_raised @@ -206,16 +219,6 @@ static const unsigned int kCodeTrapHandler9[] = { s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK) s_cbranch_scc1 .excp_raised - // If not address watch exception, then the trap enterered due to single step exception. - s_and_b32 ttmp3, ttmp2, SQ_WAVE_TRAPSTS_ADDRESS_WATCH_MASK - s_cbranch_scc0 .signal_debugger - - s_bitset1_b32 ttmp11, TTMP11_EXCP_RAISED_BIT - s_branch .signal_debugger - - .signal_trap_debugger: - s_bitset1_b32 ttmp11, TTMP11_TRAP_RAISED_BIT - .signal_debugger: // Fetch doorbell index for our queue. s_mov_b32 ttmp2, exec_lo @@ -224,8 +227,7 @@ static const unsigned int kCodeTrapHandler9[] = { s_mov_b32 exec_hi, ttmp3 // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 exec_lo, exec_lo, SENDMSG_M0_DOORBELL_ID_MASK - s_mov_b32 ttmp3, exec_lo + s_and_b32 ttmp3, exec_lo, SENDMSG_M0_DOORBELL_ID_MASK s_mov_b32 exec_lo, ttmp2 // Set the debug interrupt context id. @@ -241,26 +243,69 @@ static const unsigned int kCodeTrapHandler9[] = { // Restore m0 s_mov_b32 m0, ttmp2 - // If PC is at an s_endpgm instruction then don't halt the wavefront. 
- s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_load_dword ttmp2, [ttmp0, ttmp1] + // Parking the wave requires saving the original pc in the preserved ttmps. + // Since all ttmps are used, we must first free ttmp6 by compressing the + // 40bit dispatch ptr in ttmp6:7 into a 25bit queue packet id. + // + // Register layout before parking the wave: + // + // ttmp6: dispatch_ptr[31:6] 0[5:0] + // ttmp7: 0[0] wave_stopped[0] status_halt[0] trap_id[3:0] 0[16:0] dispatch_ptr[39:32] + // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] + // + // After parking the wave: + // + // ttmp6: pc_lo[31:0] + // ttmp7: 1[0] wave_stopped[0] status_halt[0] trap_id[3:0] packet_id[24:0] + // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] + // + // The conversion from dispatch ptr to queue packet index only needs to be + // done once, the first time the wave executes the trap handler. + + .if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) + s_branch .halt_wave + .else + s_bitcmp1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT + s_cbranch_scc1 .ttmp7_has_dispatch_index + + s_and_b32 ttmp3, ttmp3, MAX_NUM_DOORBELLS_MASK + s_lshl_b32 ttmp3, ttmp3, 0x3 + + // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], ttmp3 glc s_waitcnt lgkmcnt(0) - s_and_b32 ttmp2, ttmp2, INSN_S_ENDPGM_MASK - s_cmp_eq_u32 ttmp2, INSN_S_ENDPGM_OPCODE - s_cbranch_scc0 .halt_wave - // Since the 1st level trap handler calls the 2nd level handler when - // (mode.debug_en && !status.halt), we must clear mode.debug_en if we - // don't want to re-enter this handler indefinitely. - s_mov_b32 ttmp2, 0 - s_setreg_b32 hwreg(HW_REG_MODE, SQ_WAVE_MODE_DEBUG_EN_SHIFT, 1), ttmp2 + // Retrieve queue base_address from hsa_queue_t*. 
+ s_load_dword ttmp2, [ttmp2, ttmp3], 0x8 glc + s_waitcnt lgkmcnt(0) - s_and_b32 ttmp12, ttmp12, ~SQ_WAVE_STATUS_HALT_MASK - mExitTrap + // The dispatch index is (dispatch_ptr.lo - base_address.lo) >> 6 + s_sub_u32 ttmp2, ttmp6, ttmp2 + s_lshr_b32 ttmp2, ttmp2, 0x6 + s_andn2_b32 ttmp7, ttmp7, TTMP7_PACKET_INDEX_MASK + s_or_b32 ttmp7, ttmp7, ttmp2 + s_bitset1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT - .excp_raised: - s_bitset1_b32 ttmp11, TTMP11_EXCP_RAISED_BIT + .ttmp7_has_dispatch_index: + // Save the PC + s_mov_b32 ttmp6, ttmp0 + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp1 + // Park the wave + s_getpc_b64 [ttmp0, ttmp1] + s_add_u32 ttmp0, ttmp0, .parked - . + s_addc_u32 ttmp1, ttmp1, 0x0 + s_branch .halt_wave + + .parked: + s_trap 0x2 + s_branch .parked + .endif + + .excp_raised: // If memory violation without XNACK error then signal queue error. // XNACK error will be handled by VM interrupt, since it has more information. s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) @@ -281,24 +326,29 @@ static const unsigned int kCodeTrapHandler9[] = { s_branch .halt_wave .trap_raised: + // Save the entry trap id in ttmp7.saved_trap_id + s_min_u32 ttmp3, ttmp2, 0xF + s_lshl_b32 ttmp3, ttmp3, TTMP7_SAVED_TRAP_ID_SHIFT + s_or_b32 ttmp7, ttmp7, ttmp3 + // If debugger trap (s_trap >= 3) then signal debugger. s_cmp_ge_u32 ttmp2, 0x3; - s_cbranch_scc1 .signal_trap_debugger + s_cbranch_scc1 .signal_debugger // If llvm.trap (s_trap 2) then signal queue error. s_cmp_eq_u32 ttmp2, 0x2 s_mov_b32 ttmp3, SIGNAL_CODE_LLVM_TRAP - s_cbranch_scc1 .signal_trap_error + s_cbranch_scc1 .signal_error // For other traps advance PC and return to shader. 
s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 s_branch .exit_trap - .signal_trap_error: - s_bitset1_b32 ttmp11, TTMP11_TRAP_RAISED_BIT - .signal_error: + .if (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) + // This needs to be rewritten for gfx10.3 as scalar stores are not available. + .else // FIXME: don't trash ttmp4/ttmp5 when exception handling is unified. s_mov_b32 ttmp4, ttmp3 @@ -348,6 +398,7 @@ static const unsigned int kCodeTrapHandler9[] = { s_mov_b32 m0, 0x0 s_nop 0 s_sendmsg sendmsg(MSG_INTERRUPT) + .endif .skip_event_trigger: // Since we trashed ttmp4/ttmp5, reset the wave_id to 0 @@ -355,34 +406,37 @@ static const unsigned int kCodeTrapHandler9[] = { s_mov_b32 ttmp5, 0x0 .halt_wave: + s_bitset1_b32 ttmp7, TTMP7_WAVE_STOPPED_BIT + // Halt the wavefront. - s_or_b32 ttmp12, ttmp12, SQ_WAVE_STATUS_HALT_MASK + s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_BIT .exit_trap: mExitTrap */ - 0x92eeff6d, 0x00080010, 0xbf850040, 0xb8eef803, 0x866fff6e, 0x00000900, - 0xbf85002f, 0x866fff6e, 0x00007080, 0xbf840003, 0xbef71a88, 0xbf820001, - 0xbef71a87, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x867eff7e, 0x00000fff, - 0xbeef007e, 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, - 0xbf900001, 0xbefc006e, 0x866dff6d, 0x0000ffff, 0xc0021bb6, 0x00000000, - 0xbf8cc07f, 0x866eff6e, 0xffff0000, 0xbf06ff6e, 0xbf810000, 0xbf84004d, - 0xbeee0080, 0xb96e02c1, 0x8678ff78, 0xffffdfff, 0x8f6e8b77, 0x866eff6e, + 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, + 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, + 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, + 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, + 0xc0051bbd, 
0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, + 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, + 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, + 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, + 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, + 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, + 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, + 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, + 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, + 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, + 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, + 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, + 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, - 0xbef71a88, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef00ff, - 0x20000000, 0xbf85000f, 0x866fff6e, 0x00000800, 0xbeef00f4, 0xbf85000b, - 0xbf820034, 0xbf09836e, 0xbf85ffc7, 0xbf06826e, 0xbeef00ff, 0x80000000, - 0xbf850003, 0x806c846c, 0x826d806d, 0xbf82002d, 0xbef71a87, 0xbef0006f, - 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, 0xbf800007, - 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x867eff7e, 0x000003ff, 0x8e6f837e, - 0xbefe006e, 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0071bb7, 0x000000c0, - 0xbf8cc07f, 0xbef10080, 0xc2831c37, 0x00000008, 0xbf8cc07f, 0x87707170, - 0xbf85000e, 0xc0071c37, 0x00000010, 0xbf8cc07f, 0x86f07070, 0xbf840009, - 0xc0031bb7, 0x00000018, 0xbf8cc07f, 0xc0431bb8, 0x00000000, 0xbf8cc07f, - 0xbefc0080, 
0xbf800000, 0xbf900001, 0xbef00080, 0xbef10080, 0x8778ff78, - 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, 0x86fe7e7e, - 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; static const unsigned int kCodeCopyAligned8[] = { @@ -497,30 +551,46 @@ static const unsigned int kCodeFill10[] = { 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, }; -static const unsigned int kCodeTrapHandler10[] = { - 0x93eeff6d, 0x00080010, 0xbf850044, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf850033, 0x876fff6e, 0x00007080, 0xbf840003, 0xbef71d88, 0xbf820001, - 0xbef71d87, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x877eff7e, 0x00000fff, - 0xbeef037e, 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, - 0xbf900001, 0xbefc036e, 0x876dff6d, 0x0000ffff, 0xf4001bb6, 0xfa000000, - 0xbf8cc07f, 0x876eff6e, 0xffff0000, 0xbf06ff6e, 0xbf810000, 0xbf840051, - 0xbeee0380, 0xb9ee02c1, 0x8778ff78, 0xffffdfff, 0x906e8977, 0x876fff6e, +static const unsigned int kCodeTrapHandler1010[] = { + 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, + 0x93eeff6d, 0x00080010, 0xbf850041, 0xb96ef803, 0x876fff6e, 0x00000900, + 0xbf850031, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, + 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, + 0xbefc036e, 0xbf0d9f73, 0xbf85000f, 0x876fff6f, 0x000003ff, 0x8f6f836f, + 0xf4051bbd, 0xde000000, 0xbf8cc07f, 0xf4011bb7, 0xfa000008, 0xbf8cc07f, + 0x80ee6e72, 0x906e866e, 0x8a73ff73, 0x01ffffff, 0x88736e73, 0xbef31d9f, + 0xbef2036c, 0x876dff6d, 0x0000ffff, 0x8f6d876d, 0x8a77ff77, 0x007fff80, + 0x88776d77, 0xbeec1f00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, + 0xbf920002, 0xbf82fffe, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, + 0xbf85000d, 0xbf820036, 0x83ef8f6e, 
0x8f6f996f, 0x88736f73, 0xbf09836e, + 0xbf85ffbe, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf82002c, 0xbef0036f, 0xbeee037e, 0xbeef037f, 0xbefe03ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, + 0x877eff7e, 0x000003ff, 0x8f6f837e, 0xbefe036e, 0xf4051bbd, 0xde000000, + 0xbf8cc07f, 0xf4051bb7, 0xfa0000c0, 0xbf8cc07f, 0xbef10380, 0xf6811c37, + 0xfa000008, 0xbf8cc07f, 0x88707170, 0xbf85000e, 0xf4051c37, 0xfa000010, + 0xbf8cc07f, 0x87f07070, 0xbf840009, 0xf4011bb7, 0xfa000018, 0xbf8cc07f, + 0xf4411bb8, 0xfa000000, 0xbf8cc07f, 0xbefc0380, 0xbf800000, 0xbf900001, + 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, 0xbef71d88, 0x876fff6e, - 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef03ff, 0x20000000, 0xbf85000f, - 0x876fff6e, 0x00000800, 0xbeef03f4, 0xbf85000b, 0xbf820034, 0xbf09836e, - 0xbf85ffc3, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002d, 0xbef71d87, 0xbef0036f, 0xbeee037e, 0xbeef037f, - 0xbefe03ff, 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, - 0xbeff036f, 0x877eff7e, 0x000003ff, 0x8f6f837e, 0xbefe036e, 0xf4051bbd, - 0xde000000, 0xbf8cc07f, 0xf4051bb7, 0xfa0000c0, 0xbf8cc07f, 0xbef10380, - 0xf6811c37, 0xfa000008, 0xbf8cc07f, 0x88707170, 0xbf85000e, 0xf4051c37, - 0xfa000010, 0xbf8cc07f, 0x87f07070, 0xbf840009, 0xf4011bb7, 0xfa000018, - 0xbf8cc07f, 0xf4411bb8, 0xfa000000, 0xbf8cc07f, 0xbefc0380, 0xbf800000, - 0xbf900001, 0xbef00380, 0xbef10380, 0x8878ff78, 0x00002000, 0x906e8977, - 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, - 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, + 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, +}; + +static const unsigned int kCodeTrapHandler10[] = { + 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, + 0x93eeff6d, 
0x00080010, 0xbf850023, 0xb96ef803, 0x876fff6e, 0x00000900, + 0xbf850013, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, + 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, + 0xbefc036e, 0xbf82001a, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, + 0xbf85000d, 0xbf82000e, 0x83ef8f6e, 0x8f6f996f, 0x88736f73, 0xbf09836e, + 0xbf85ffdc, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf820004, 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, + 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, + 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, }; } // namespace amd diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 1527682f7..510e36960 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -242,6 +242,10 @@ hsa_status_t hsa_amd_register_deallocation_callback( hsa_status_t hsa_amd_deregister_deallocation_callback( void* ptr, hsa_amd_deallocation_callback_t callback); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + } // namespace amd } // namespace rocr diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 53d78f8f0..634224c5f 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -314,7 +314,7 @@ class Runtime { amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; } - std::function& + std::function& system_allocator() { return system_allocator_; } @@ -341,6 +341,10 @@ class Runtime { uint64_t sys_clock_freq() const { return sys_clock_freq_; } + void KfdVersion(const HsaVersionInfo& version) { kfd_version = version; } + + HsaVersionInfo KfdVersion() const { return kfd_version; } + protected: static void 
AsyncEventsLoop(void*); @@ -482,8 +486,7 @@ class Runtime { std::map allocation_map_; // Allocator using ::system_region_ - std::function - system_allocator_; + std::function system_allocator_; // Deallocator using ::system_region_ std::function system_deallocator_; @@ -533,6 +536,9 @@ class Runtime { // Pools KFD Events for InterruptSignal InterruptSignal::EventPool EventPool; + // Kfd version + HsaVersionInfo kfd_version; + // Frees runtime memory when the runtime library is unloaded if safe to do so. // Failure to release the runtime indicates an incorrect application but is // common (example: calls library routines at process exit). diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index a6c5586d8..06265bda2 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -202,6 +202,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_7; ASICShader compute_8; ASICShader compute_9; + ASICShader compute_1010; ASICShader compute_10; }; @@ -211,6 +212,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, }}, {"CopyAligned", @@ -219,6 +221,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, }}, {"CopyMisaligned", { @@ -226,6 +229,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned8, 
sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, }}, {"Fill", { @@ -233,6 +237,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -249,10 +254,13 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: - asic_shader = &compiled_shader_it->second.compute_9; + asic_shader = &compiled_shader_it->second.compute_9; break; case 10: - asic_shader = &compiled_shader_it->second.compute_10; + if(isa_->GetMinorVersion() == 1) + asic_shader = &compiled_shader_it->second.compute_1010; + else + asic_shader = &compiled_shader_it->second.compute_10; break; default: assert(false && "Precompiled shader unavailable for target"); @@ -1128,15 +1136,13 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled."); if (scratch.queue_base != nullptr) { - if (profile_ == HSA_PROFILE_FULL) return; - if (profile_ == HSA_PROFILE_BASE) { - HSAuint64 alternate_va; - if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == - HSAKMT_STATUS_SUCCESS) { - if (scratch.large) scratch_used_large_ += scratch.size; - scratch_cache_.insert(scratch); - return; - } + HSAuint64 alternate_va; + if ((profile_ == HSA_PROFILE_FULL) || + (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == + HSAKMT_STATUS_SUCCESS)) { + if (scratch.large) scratch_used_large_ += scratch.size; + scratch_cache_.insert(scratch); + return; } } diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp 
index d813d38c6..185838489 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -67,7 +67,6 @@ namespace AMD { // Minimum acceptable KFD version numbers static const uint kKfdVersionMajor = 0; static const uint kKfdVersionMinor = 99; -static HsaVersionInfo kfd_version; CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (node_prop.NumCPUCores == 0) { @@ -89,7 +88,10 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { try { gpu = new GpuAgent(node_id, node_prop); - // Check for sramecc incompatibility in gfx906 and gfx908. sramecc bit fixed in kfd 1.4. + const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion(); + + // Check for sramecc incompatibility due to sramecc not being reported correctly in kfd before + // 1.4. if (gpu->isa()->IsSrameccSupported() && (kfd_version.KernelInterfaceMajorVersion <= 1 && kfd_version.KernelInterfaceMinorVersion < 4)) { // gfx906 has both sramecc modes in use. Suppress the device. @@ -226,6 +228,7 @@ static void SurfaceGpuList(std::vector& gpu_list) { /// @brief Calls Kfd thunk to get the snapshot of the topology of the system, /// which includes associations between, node, devices, memory and caches. 
void BuildTopology() { + HsaVersionInfo kfd_version; if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) { return; } @@ -241,6 +244,8 @@ void BuildTopology() { core::g_use_interrupt_wait = false; } + core::Runtime::runtime_singleton_->KfdVersion(kfd_version); + HsaSystemProperties props; hsaKmtReleaseSystemProperties(); diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index 5a5c8bcd3..9e50971b3 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -390,6 +390,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_memory_lock_to_pool_fn = AMD::hsa_amd_memory_lock_to_pool; amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback; amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback; + amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer; } void LoadInitialHsaApiTable() { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index d65d0dc0e..f4776eb99 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -470,6 +470,23 @@ hsa_status_t hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t nu CATCH; } +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t hsa_signal, + volatile hsa_signal_value_t** value_ptr) { + TRY; + IS_OPEN(); + IS_BAD_PTR(value_ptr); + core::Signal* signal = core::Signal::Convert(hsa_signal); + IS_VALID(signal); + + if(!core::BusyWaitSignal::IsType(signal)) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + *value_ptr = (volatile hsa_signal_value_t*)&signal->signal_.value; + return HSA_STATUS_SUCCESS; + + CATCH; +} + uint32_t hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* hsa_signals, hsa_signal_condition_t* conds, hsa_signal_value_t* values, uint64_t timeout_hint, hsa_wait_state_t wait_hint, diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp 
index 547fbf83d..8b889bb5d 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -293,6 +293,7 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx1030", 10, 3, 0, unsupported, unsupported) ISAREG_ENTRY_GEN("gfx1031", 10, 3, 1, unsupported, unsupported) ISAREG_ENTRY_GEN("gfx1032", 10, 3, 2, unsupported, unsupported) + ISAREG_ENTRY_GEN("gfx1033", 10, 3, 3, unsupported, unsupported) #undef ISAREG_ENTRY_GEN return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 9ea49a9e3..c3cce57a1 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -173,20 +173,14 @@ void Runtime::RegisterAgent(Agent* agent) { if (cpu_agents_.size() == 1) { // Might need memory pooling to cover allocation that // requires less than 4096 bytes. - system_allocator_ = - [&](size_t size, size_t alignment, - MemoryRegion::AllocateFlags alloc_flags) -> void* { - assert(alignment <= 4096); - void* ptr = NULL; - return (HSA_STATUS_SUCCESS == - core::Runtime::runtime_singleton_->AllocateMemory( - system_regions_fine_[0], size, alloc_flags, &ptr)) - ? ptr - : NULL; - }; - - system_deallocator_ = - [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; + system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* { + assert(align <= 4096); + void* ptr = nullptr; + core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr); + return ptr; + }; + + system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); } @@ -451,9 +445,8 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { requires the caller to specify all allowed agents we can't assume that a peer mapped pointer would remain mapped for the duration of the copy. 
*/ - void* temp = nullptr; - system_region->Allocate(size, core::MemoryRegion::AllocateNoFlags, &temp); - MAKE_SCOPE_GUARD([&]() { system_region->Free(temp, size); }); + void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags); + MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); }); hsa_status_t err = src_agent->DmaCopy(temp, source, size); if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size); return err; @@ -1273,7 +1266,8 @@ Runtime::Runtime() sys_clock_freq_(0), vm_fault_event_(nullptr), vm_fault_signal_(nullptr), - ref_count_(0) {} + ref_count_(0), + kfd_version{0} {} hsa_status_t Runtime::Load() { flag_.Refresh(); diff --git a/src/hsacore.so.def b/src/hsacore.so.def index ea90b3363..4666121cf 100644 --- a/src/hsacore.so.def +++ b/src/hsacore.so.def @@ -220,6 +220,7 @@ global: hsa_amd_queue_set_priority; hsa_amd_register_deallocation_callback; hsa_amd_deregister_deallocation_callback; + hsa_amd_signal_value_pointer; _amdgpu_r_debug; local: diff --git a/src/image/addrlib/src/core/addrlib2.cpp b/src/image/addrlib/src/core/addrlib2.cpp index 700287abd..2d215cb65 100644 --- a/src/image/addrlib/src/core/addrlib2.cpp +++ b/src/image/addrlib/src/core/addrlib2.cpp @@ -314,8 +314,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( } } - ADDR_ASSERT(pOut->surfSize != 0); - ValidBaseAlignments(pOut->baseAlign); return returnCode; diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp index 4d5a996a9..33a3e1c91 100644 --- a/src/image/blit_kernel.cpp +++ b/src/image/blit_kernel.cpp @@ -88,6 +88,7 @@ extern uint8_t ocl_blit_object_gfx1012[]; extern uint8_t ocl_blit_object_gfx1030[]; extern uint8_t ocl_blit_object_gfx1031[]; extern uint8_t ocl_blit_object_gfx1032[]; +extern uint8_t ocl_blit_object_gfx1033[]; // Arguments inserted by OCL compiler, all zero here. 
struct OCLHiddenArgs { @@ -1001,6 +1002,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx1031; } else if (sname == "gfx1032") { *blit_code_object = ocl_blit_object_gfx1032; + } else if (sname == "gfx1033") { + *blit_code_object = ocl_blit_object_gfx1033; } else { return HSA_STATUS_ERROR_INVALID_ISA_NAME; } diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt index 465b1caf6..5e4fd7f0d 100644 --- a/src/image/blit_src/CMakeLists.txt +++ b/src/image/blit_src/CMakeLists.txt @@ -69,7 +69,7 @@ endif() # Determine the target devices if not specified if (NOT DEFINED TARGET_DEVICES) - set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032") + set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/src/image/image_manager_nv.cpp b/src/image/image_manager_nv.cpp index 755fa8f01..7312f6d80 100755 --- a/src/image/image_manager_nv.cpp +++ b/src/image/image_manager_nv.cpp @@ -484,7 +484,11 @@ hsa_status_t ImageManagerNv::PopulateImageSrd(Image& image) const { word4.f.DEPTH = (image_array) // Doesn't hurt but isn't array_size already >0? ? std::max(image.desc.array_size, static_cast(1)) - 1 - : (image_3d) ? image.desc.depth - 1 : out.pitch - 1; + : (image_3d) ? 
image.desc.depth - 1 : 0; + uint32_t minor_ver = MinorVerFromDevID(chip_id_); + // For 1d, 2d and 2d-msaa in gfx1030 and beyond this is pitch-1 + if ((minor_ver >= 3) && !image_array && !image_3d) + word4.f.PITCH = out.pitch - 1; word5.val = 0; word6.val = 0; @@ -630,6 +634,7 @@ uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv( const uint32_t num_slice = static_cast( std::max(kMinNumSlice, std::max(desc.array_size, desc.depth))); + uint32_t minor_ver = MinorVerFromDevID(chip_id_); ADDR2_COMPUTE_SURFACE_INFO_INPUT in = {0}; in.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT); in.format = addrlib_format; @@ -637,6 +642,9 @@ uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv( in.width = width; in.height = height; in.numSlices = num_slice; + // Custom Pitch is supported in gfx1030 and beyond + if (minor_ver >= 3) + in.pitchInElement = image_data_row_pitch / image_prop.element_size; switch (desc.geometry) { case HSA_EXT_IMAGE_GEOMETRY_1D: case HSA_EXT_IMAGE_GEOMETRY_1DB: diff --git a/src/image/resource_nv.h b/src/image/resource_nv.h index 986d08e73..1e9f5d250 100755 --- a/src/image/resource_nv.h +++ b/src/image/resource_nv.h @@ -300,21 +300,34 @@ union SQ_IMG_RSRC_WORD3 { #define SQ_IMG_RSC_WRD4_REG_SZ 32 #define SQ_IMG_RSC_WRD4_DEPTH_SZ 13 #define SQ_IMG_RSC_WRD4_BASE_ARR_SZ 13 -struct sq_img_rsrc_word4_t { +#define SQ_IMG_RSC_WRD4_PITCH_SZ 14 +union sq_img_rsrc_word4_t { + struct { #if defined(LITTLEENDIAN_CPU) - // For arrays this is last slice in view, for 3D this is depth-1, For remaining this is pitch-1 - unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; - unsigned int : 1; //Pitch[13] in gfx1030 - unsigned int : 2; - unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; - unsigned int : 3; + // For arrays this is last slice in view, for 3D this is depth-1, For remaining this is pitch-1 + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; + unsigned int : 1; //Pitch[13] in gfx1030 + unsigned int : 2; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int : 
3; #elif defined(BIGENDIAN_CPU) - unsigned int : 3; - unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; - unsigned int : 2; - unsigned int : 1; //Pitch[13] in gfx1030 - unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; //Pitch[0:12] in gfx1030 + unsigned int : 3; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int : 2; + unsigned int : 1; //Pitch[13] in gfx1030 + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; //Pitch[0:12] in gfx1030 #endif + }; + struct { +#if defined(LITTLEENDIAN_CPU) + // For 1d, 2d and 2d-msaa in gfx1030 this is pitch-1 + unsigned int PITCH : SQ_IMG_RSC_WRD4_PITCH_SZ; + unsigned int : SQ_IMG_RSC_WRD4_REG_SZ-SQ_IMG_RSC_WRD4_PITCH_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int : SQ_IMG_RSC_WRD4_REG_SZ-SQ_IMG_RSC_WRD4_PITCH_SZ; + unsigned int PITCH : SQ_IMG_RSC_WRD4_PITCH_SZ; +#endif + }; }; union SQ_IMG_RSRC_WORD4 { sq_img_rsrc_word4_t bitfields, bits, f; diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index 5c33f07f8..bf3e9197e 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -182,6 +182,7 @@ struct AmdExtTable { decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn; decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; + decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; }; // Table to export HSA Core Runtime Apis diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 04a6e4d71..9df7c49ae 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -55,6 +55,115 @@ extern "C" { #endif +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type. 
+ */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. + */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor as a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t. + */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0.
+ */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** @} */ + /** * @brief Enumeration constants added to ::hsa_status_t. * @@ -485,6 +594,37 @@ hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uin const hsa_agent_t* consumers, uint64_t attributes, hsa_signal_t* signal); +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restrictions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] signal Signal handle to extract the signal value pointer from. + * + * @param[out] value_ptr Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + /** * @brief Asyncronous signal handler function type.
* diff --git a/src/inc/hsa_ven_amd_aqlprofile.h b/src/inc/hsa_ven_amd_aqlprofile.h index fb763c0ed..169ab5278 100644 --- a/src/inc/hsa_ven_amd_aqlprofile.h +++ b/src/inc/hsa_ven_amd_aqlprofile.h @@ -114,6 +114,13 @@ typedef enum { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24, // System blocks HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25, + // GFX10 added blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER } hsa_ven_amd_aqlprofile_block_name_t; diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index d53a09850..eb640fb4c 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -75,7 +75,10 @@ __attribute__((noinline)) static void _loader_debug_state() { // r_version history: // 1: Initial debug protocol // 2: New trap handler ABI. The reason for halting a wave is recorded in ttmp11[8:7]. -HSA_API r_debug _amdgpu_r_debug = {2, +// 3: New trap handler ABI. A wave halted at S_ENDPGM rewinds its PC by 8 bytes, and sets ttmp11[9]=1. +// 4: New trap handler ABI. Save the trap id in ttmp11[16:9] +// 5: New trap handler ABI. 
Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped +HSA_API r_debug _amdgpu_r_debug = {5, nullptr, reinterpret_cast(&_loader_debug_state), r_debug::RT_CONSISTENT, diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp index a369896fa..a36ce2c95 100644 --- a/src/loader/loaders.cpp +++ b/src/loader/loaders.cpp @@ -99,6 +99,7 @@ namespace loader { gfx1030.handle = 1030; gfx1031.handle = 1031; gfx1032.handle = 1032; + gfx1033.handle = 1033; } hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) @@ -148,6 +149,8 @@ namespace loader { return gfx1031; } else if (sname == "AMD:AMDGPU:10:3:2") { return gfx1032; + } else if (sname == "AMD:AMDGPU:10:3:3") { + return gfx1033; } // The offline loader only supports code object v2 which only supports diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp index e98c7463a..ef6ef2a59 100644 --- a/src/loader/loaders.hpp +++ b/src/loader/loaders.hpp @@ -58,7 +58,7 @@ namespace loader { hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705; hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810; hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908; - hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032; + hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033; std::ostream& out; typedef std::set PointerSet; PointerSet pointers;