From 6d81a6c6370b9e6ffc5e2dd504ae8b2a4447ba8a Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Tue, 20 Dec 2022 11:56:43 +0000 Subject: [PATCH 01/56] SWDEV-372396 - Address the errors in hiprtc headers with -Werror Change-Id: Ied553216dfc2ff8b5c72de617cb47b6ca8f52063 --- include/hip/amd_detail/amd_device_functions.h | 6 ++- .../amd_detail/amd_hip_cooperative_groups.h | 32 +++++++------ .../hip/amd_detail/amd_hip_unsafe_atomics.h | 6 ++- include/hip/amd_detail/amd_warp_functions.h | 11 ++++- .../hip_cooperative_groups_helper.h | 46 ++++++++++++------- src/hiprtc/cmake/HIPRTC.cmake | 9 +++- src/hiprtc/hiprtcInternal.cpp | 4 +- 7 files changed, 78 insertions(+), 36 deletions(-) diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h index ce421c63..57403eb3 100644 --- a/include/hip/amd_detail/amd_device_functions.h +++ b/include/hip/amd_detail/amd_device_functions.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -444,7 +444,8 @@ __device__ static inline unsigned long long int __double2ull_ru(double x) { __device__ static inline unsigned long long int __double2ull_rz(double x) { return (unsigned long long int)x; } - +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" __device__ static inline long long int __double_as_longlong(double x) { static_assert(sizeof(long long) == sizeof(double), ""); @@ -453,6 +454,7 @@ __device__ static inline long long int __double_as_longlong(double x) { return tmp; } +#pragma clang diagnostic pop /* __device__ unsigned short __float2half_rn(float x); diff --git a/include/hip/amd_detail/amd_hip_cooperative_groups.h b/include/hip/amd_detail/amd_hip_cooperative_groups.h index 747f65a4..575a9f8e 100644 --- a/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,6 +32,13 @@ THE SOFTWARE. 
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++98-compat" +#pragma clang diagnostic ignored "-Wsign-conversion" +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wpadded" + #if __cplusplus #if !defined(__HIPCC_RTC__) #include @@ -69,8 +76,8 @@ class thread_group { // only when the group is supposed to contain only the calling the thread // (throurh the API - `this_thread()`), and in all other cases, this thread // group object is a sub-object of some other derived thread group object - __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = (uint64_t)0, - uint64_t mask = (uint64_t)0) { + __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast(0), + uint64_t mask = static_cast(0)) { _type = type; _size = size; _mask = mask; @@ -199,7 +206,7 @@ class thread_block : public thread_group { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); // Invalid tile size, assert if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { - __hip_assert(false && "invalid tile size"); + __hip_assert(false && "invalid tile size") } thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size); @@ -246,7 +253,7 @@ class tiled_group : public thread_group { const bool pow2 = ((tile_size & (tile_size - 1)) == 0); if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { - __hip_assert(false && "invalid tile size"); + __hip_assert(false && "invalid tile size") } if (size() <= tile_size) { @@ -282,7 +289,6 @@ class tiled_group : public thread_group { * \details Represents a active thread group in a wavefront. * This group type also supports sub-wave level intrinsics. 
*/ - class coalesced_group : public thread_group { private: friend __CG_QUALIFIER__ coalesced_group coalesced_threads(); @@ -300,8 +306,8 @@ class coalesced_group : public thread_group { // prepare a mask for further partitioning it so that it stays coalesced. if (coalesced_info.tiled_info.is_tiled) { unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); - unsigned int masklength = min((unsigned int)size() - base_offset, tile_size); - lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); + unsigned int masklength = min(static_cast(size()) - base_offset, tile_size); + lane_mask member_mask = static_cast(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength); member_mask <<= (__lane_id() & ~(tile_size - 1)); coalesced_group coalesced_tile = coalesced_group(member_mask); @@ -358,7 +364,7 @@ class coalesced_group : public thread_group { __CG_QUALIFIER__ T shfl(T var, int srcRank) const { static_assert(is_valid_type::value, "Neither an integer or float type."); - srcRank = srcRank % size(); + srcRank = srcRank % static_cast(size()); int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank : (__AMDGCN_WAVEFRONT_SIZE == 64) ? 
__fns64(coalesced_info.member_mask, 0, (srcRank + 1)) @@ -452,7 +458,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { return (static_cast(this)->thread_rank()); } default: { - __hip_assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type") return -1; } } @@ -476,7 +482,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const { return (static_cast(this)->is_valid()); } default: { - __hip_assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type") return false; } } @@ -505,7 +511,7 @@ __CG_QUALIFIER__ void thread_group::sync() const { break; } default: { - __hip_assert(false && "invalid cooperative group type"); + __hip_assert(false && "invalid cooperative group type") } } } @@ -697,6 +703,6 @@ __CG_QUALIFIER__ thread_block_tile tiled_partition(const Paren return impl::tiled_partition_internal(g); } } // namespace cooperative_groups - +#pragma clang diagnostic pop #endif // __cplusplus #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/include/hip/amd_detail/amd_hip_unsafe_atomics.h index 243b5a64..f9e9738a 100644 --- a/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2021 - Present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,9 @@ THE SOFTWARE. #pragma once #ifdef __cplusplus +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + /** * @brief Unsafe floating point rmw atomic add. 
* @@ -563,4 +566,5 @@ __device__ inline double safeAtomicMin(double* addr, double val) { #endif } +#pragma clang diagnostic pop #endif diff --git a/include/hip/amd_detail/amd_warp_functions.h b/include/hip/amd_detail/amd_warp_functions.h index b18ff5f5..fb6065b1 100644 --- a/include/hip/amd_detail/amd_warp_functions.h +++ b/include/hip/amd_detail/amd_warp_functions.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,14 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wsign-conversion" +#pragma clang diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic ignored "-Wc++98-compat" +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" + __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i); @@ -491,4 +499,5 @@ unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = return tmp1; } +#pragma clang diagnostic pop #endif diff --git a/include/hip/amd_detail/hip_cooperative_groups_helper.h b/include/hip/amd_detail/hip_cooperative_groups_helper.h index a90f0a3a..877c6a43 100644 --- a/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -39,6 +39,12 @@ THE SOFTWARE. #define __align__(x) __attribute__((aligned(x))) #endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" +#pragma clang diagnostic ignored "-Wc++98-compat" +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#pragma clang diagnostic ignored "-Wshorten-64-to-32" + #if !defined(__CG_QUALIFIER__) #define __CG_QUALIFIER__ __device__ __forceinline__ #endif @@ -92,15 +98,18 @@ typedef enum { */ namespace multi_grid { -__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); } +__CG_STATIC_QUALIFIER__ uint32_t num_grids() { + return static_cast(__ockl_multi_grid_num_grids()); } -__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); } +__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { + return static_cast(__ockl_multi_grid_grid_rank()); } -__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); } +__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast(__ockl_multi_grid_size()); } -__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); } +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + return static_cast(__ockl_multi_grid_thread_rank()); } -__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); } +__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_multi_grid_is_valid()); } __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } @@ -112,28 +121,28 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } namespace grid { __CG_STATIC_QUALIFIER__ uint32_t size() { - return (uint32_t)((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * + return static_cast((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * 
(blockDim.x * gridDim.x)); } __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { // Compute global id of the workgroup to which the current thread belongs to - uint32_t blkIdx = (uint32_t)((blockIdx.z * gridDim.y * gridDim.x) + + uint32_t blkIdx = static_cast((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x) + (blockIdx.x)); // Compute total number of threads being passed to reach current workgroup // within grid uint32_t num_threads_till_current_workgroup = - (uint32_t)(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); + static_cast(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); // Compute thread local rank within current workgroup - uint32_t local_thread_rank = (uint32_t)((threadIdx.z * blockDim.y * blockDim.x) + + uint32_t local_thread_rank = static_cast((threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x)); return (num_threads_till_current_workgroup + local_thread_rank); } -__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); } +__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_grid_is_valid()); } __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } @@ -146,19 +155,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } namespace workgroup { __CG_STATIC_QUALIFIER__ dim3 group_index() { - return (dim3((uint32_t)blockIdx.x, (uint32_t)blockIdx.y, (uint32_t)blockIdx.z)); + return (dim3(static_cast(blockIdx.x), static_cast(blockIdx.y), + static_cast(blockIdx.z))); } __CG_STATIC_QUALIFIER__ dim3 thread_index() { - return (dim3((uint32_t)threadIdx.x, (uint32_t)threadIdx.y, (uint32_t)threadIdx.z)); + return (dim3(static_cast(threadIdx.x), static_cast(threadIdx.y), + static_cast(threadIdx.z))); } __CG_STATIC_QUALIFIER__ uint32_t size() { - return ((uint32_t)(blockDim.x * blockDim.y * blockDim.z)); + return (static_cast(blockDim.x * blockDim.y * blockDim.z)); } __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - return ((uint32_t)((threadIdx.z * blockDim.y * 
blockDim.x) + + return (static_cast((threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x))); } @@ -187,8 +198,8 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, " // // For each thread, this function returns the number of active threads which // have i-th bit of x set and come before the current thread. -__device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { - int counter=0; +__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { + unsigned int counter=0; #if __AMDGCN_WAVEFRONT_SIZE == 32 counter = __builtin_amdgcn_mbcnt_lo(x, add); #else @@ -206,5 +217,6 @@ __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) { } // namespace cooperative_groups +#pragma clang diagnostic pop #endif // __cplusplus #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H diff --git a/src/hiprtc/cmake/HIPRTC.cmake b/src/hiprtc/cmake/HIPRTC.cmake index 19b59f36..521b0234 100644 --- a/src/hiprtc/cmake/HIPRTC.cmake +++ b/src/hiprtc/cmake/HIPRTC.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,6 +29,7 @@ function(get_hiprtc_macros HIPRTC_DEFINES) set(${HIPRTC_DEFINES} "#pragma clang diagnostic push\n\ #pragma clang diagnostic ignored \"-Wreserved-id-macro\"\n\ +#pragma clang diagnostic ignored \"-Wc++98-compat-pedantic\"\n\ #define __device__ __attribute__((device))\n\ #define __host__ __attribute__((host))\n\ #define __global__ __attribute__((global))\n\ @@ -51,7 +52,10 @@ function(get_hiprtc_macros HIPRTC_DEFINES) #pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_RUNTIME_H\n\ #define HIP_INCLUDE_HIP_HIP_FP16_H\n\ +#pragma clang diagnostic push\n\ +#pragma clang diagnostic ignored \"-Wreserved-macro-identifier\"\n\ #define _HIP_BFLOAT16_H_\n\ +#pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H" PARENT_SCOPE) endfunction(get_hiprtc_macros) @@ -63,12 +67,15 @@ if(HIPRTC_ADD_MACROS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_DEFINES}") FILE(READ "${HIPRTC_WARP_HEADER_FILE}" HIPRTC_WARP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_WARP_HEADER}") +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" FILE(READ "${HIPRTC_COOP_HELPER_FILE}" HIPRTC_COOP_HELPER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HELPER}") FILE(READ "${HIPRTC_COOP_HEADER_FILE}" HIPRTC_COOP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HEADER}") FILE(READ "${HIPRTC_UNSAFE_ATOMICS_FILE}" HIPRTC_UNSAFE_ATOMICS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_UNSAFE_ATOMICS}") +#pragma clang diagnostic pop endif() macro(generate_hiprtc_header HiprtcHeader) diff --git a/src/hiprtc/hiprtcInternal.cpp b/src/hiprtc/hiprtcInternal.cpp index dfff262c..e4e4f044 100644 --- a/src/hiprtc/hiprtcInternal.cpp +++ b/src/hiprtc/hiprtcInternal.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2022 - Present Advanced Micro Devices, 
Inc. All rights reserved. +Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -135,6 +135,8 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("hiprtc_runtime.h"); compile_options_.push_back("-std=c++14"); compile_options_.push_back("-nogpuinc"); + compile_options_.push_back("-Wno-gnu-line-marker"); + compile_options_.push_back("-Wno-missing-prototypes"); #ifdef _WIN32 compile_options_.push_back("-target"); compile_options_.push_back("x86_64-pc-windows-msvc"); From 0ef12695a9558b3507a7491e09c787aba7fc72a3 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Tue, 24 Jan 2023 09:50:43 +0000 Subject: [PATCH 02/56] SWDEV-379125 - Check only if there is attr set for graph kernel node. Change-Id: I8768b33ad27e75eb753d99d682edf60b31b240df --- src/hip_graph_internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 6462535d..5fb557dc 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -942,7 +942,7 @@ class hipGraphKernelNode : public hipGraphNode { } hipError_t GetAttrParams(hipKernelNodeAttrID attr, hipKernelNodeAttrValue* params) { // Get kernel attr params - if (kernelAttrInUse_ != attr) return hipErrorInvalidValue; + if (kernelAttrInUse_ != 0 && kernelAttrInUse_ != attr) return hipErrorInvalidValue; if (attr == hipKernelNodeAttributeAccessPolicyWindow) { params->accessPolicyWindow.base_ptr = kernelAttr_.accessPolicyWindow.base_ptr; params->accessPolicyWindow.hitProp = kernelAttr_.accessPolicyWindow.hitProp; From 26e8996ce868dd060e65ed60621dd2c9e358e17a Mon Sep 17 00:00:00 2001 From: Rakesh Roy Date: Fri, 20 Jan 2023 22:39:43 +0530 Subject: [PATCH 03/56] SWDEV-375004 - Fix rocprim test failure - For !__HIPCC_RTC__ case, 
operator<<(std::ostream& os, const hip_bfloat16& bf16) calls itself in endless recursion - Convert hip_bfloat16 to float to fix this Change-Id: I252a656817550caf43c587cebf461474f12b8c0c --- include/hip/amd_detail/amd_hip_bfloat16.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_bfloat16.h b/include/hip/amd_detail/amd_hip_bfloat16.h index 8c7f7839..deb3bfb7 100644 --- a/include/hip/amd_detail/amd_hip_bfloat16.h +++ b/include/hip/amd_detail/amd_hip_bfloat16.h @@ -32,10 +32,8 @@ #include "host_defines.h" #if defined(__HIPCC_RTC__) #define __HOST_DEVICE__ __device__ - #define HIP_OSTREAM __hip_internal::ostream #else #define __HOST_DEVICE__ __host__ __device__ - #define HIP_OSTREAM std::ostream #endif #if __cplusplus < 201103L || !defined(__HIPCC__) @@ -181,12 +179,12 @@ static_assert(__hip_internal::is_trivial{}, static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public) && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data), "internal hip_bfloat16 does not match public hip_bfloat16"); -#endif -inline HIP_OSTREAM& operator<<(HIP_OSTREAM& os, const hip_bfloat16& bf16) +inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16) { - return os << bf16; + return os << float(bf16); } +#endif inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a) { From 3bd49d75c237e33d2b3f74a1be33735646419b8e Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Tue, 24 Jan 2023 10:31:18 -0500 Subject: [PATCH 04/56] SWDEV-379151 - correct error status - if the stream is null ptr and there is capture ongoing return an error Signed-off-by: sdashmiz Change-Id: Iff33e4aebc253fb7bb56daf5c455722d726c6705 --- src/hip_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 3d1e9168..35d673b9 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -695,7 +695,7 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, 
void* userD // ================================================================================================ hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData) { HIP_INIT_API(hipLaunchHostFunc, stream, fn, userData); - if (stream == nullptr) { + if (stream == nullptr && (hip::Stream::StreamCaptureOngoing() == true)) { HIP_RETURN(hipErrorStreamCaptureImplicit); } HIP_RETURN(hipLaunchHostFunc_common(stream, fn, userData)); From f04b9f79e75d330df28cf93045f0232bd069adc0 Mon Sep 17 00:00:00 2001 From: Ajay GunaShekar Date: Sun, 15 Jan 2023 10:01:14 -0500 Subject: [PATCH 05/56] SWDEV-372757 - Don't destroy null queue in MT This reverts commit 48cc8c33ee83ecc1925954432a4652299bf44311. Reason for revert: patch does not fix all stream hangs. So another patch was merged which fixes all issues. Change-Id: I332d1ea29c23747b46b7667fe3e34e0ceefd2b23 --- src/hip_internal.hpp | 2 +- src/hip_stream.cpp | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 99bdf00d..31ec410d 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -409,7 +409,7 @@ namespace hip { Device(amd::Context* ctx, int devId): context_(ctx), deviceId_(devId), null_stream_(this, Stream::Priority::Normal, 0, true), - flags_(hipDeviceScheduleSpin), + flags_(hipDeviceScheduleSpin), isActive_(false), default_mem_pool_(nullptr), current_mem_pool_(nullptr) diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 35d673b9..a0444334 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -48,12 +48,8 @@ Stream::~Stream() { amd::ScopedLock lock(streamSetLock); streamSet.erase(this); - // Skip queue destruction for null stream in MT. Queue worker thread can be destroyed on - // the app exit, during the stream destruction, causing a race condition. 
- if (!null_ || AMD_DIRECT_DISPATCH) { - queue_->release(); - queue_ = nullptr; - } + queue_->release(); + queue_ = nullptr; } } From 5f16e600713ff72a6b49ab39e8f868c6ff1c46c7 Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Thu, 12 Jan 2023 16:24:55 -0500 Subject: [PATCH 06/56] SWDEV-374389 - correct update dependency behaviour - nodes should belong to graph - num of passed dependency can't be larger than graph nodes Signed-off-by: sdashmiz Change-Id: Ia6f2283546bc44edee705b0483bfe506b7b1177a --- src/hip_graph.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp index 4b5d9e15..74c89249 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -1674,13 +1674,19 @@ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t if (s->GetCaptureStatus() == hipStreamCaptureStatusNone) { HIP_RETURN(hipErrorIllegalState); } - if ((numDependencies > 0 && dependencies == nullptr) || + if ((s->GetCaptureGraph()->GetNodeCount() < numDependencies) || + (numDependencies > 0 && dependencies == nullptr) || (flags != 0 && flags != hipStreamAddCaptureDependencies && flags != hipStreamSetCaptureDependencies)) { HIP_RETURN(hipErrorInvalidValue); } std::vector depNodes; + const std::vector& graphNodes = s->GetCaptureGraph()->GetNodes(); for (int i = 0; i < numDependencies; i++) { + if ((dependencies[i] == nullptr) || + std::find(std::begin(graphNodes), std::end(graphNodes), dependencies[i]) == std::end(graphNodes)) { + HIP_RETURN(hipErrorInvalidValue); + } depNodes.push_back(dependencies[i]); } if (flags == hipStreamAddCaptureDependencies) { From b242cbcaa52e1ee9293382996c6573d2b9f3601a Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Thu, 5 Jan 2023 14:08:45 -0500 Subject: [PATCH 07/56] SWDEV-374368 - dependency list should not have duplicate Signed-off-by: sdashmiz Change-Id: I67e1c8203f10916b98408e2004e6c64e96b1933b --- src/hip_graph.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git
a/src/hip_graph.cpp b/src/hip_graph.cpp index 74c89249..35b14e23 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -36,11 +36,16 @@ inline hipError_t ihipGraphAddNode(hipGraphNode_t graphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, bool capture = true) { graph->AddNode(graphNode); + std::unordered_set DuplicateDep; for (size_t i = 0; i < numDependencies; i++) { if ((!hipGraphNode::isNodeValid(pDependencies[i])) || (graph != pDependencies[i]->GetParentGraph())) { return hipErrorInvalidValue; } + if (DuplicateDep.find(pDependencies[i]) != DuplicateDep.end()) { + return hipErrorInvalidValue; + } + DuplicateDep.insert(pDependencies[i]); pDependencies[i]->AddEdge(graphNode); } if (capture == false) { From 40d56ed9d62266c3d19b079888e03777ec935987 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 26 Jan 2023 23:48:49 +0000 Subject: [PATCH 08/56] SWDEV-379678 - Remove catch build option Change-Id: Idedf5e61da1e6c7fbdd9a65e57220875a3b24112 --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ccd8337c..38948ecf 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -447,9 +447,7 @@ if(${RUN_HIT} EQUAL 0) execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) endif() if(HIP_CATCH_TEST EQUAL "1") - enable_testing() - set(HIP_PATH ${HIP_ROOT_DIR}) - add_subdirectory(${HIP_COMMON_DIR}/tests/catch ${PROJECT_BINARY_DIR}/catch) + message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! 
") else() if(${RUN_HIT} EQUAL 0) set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH}) From 8fbdad1ee2d5fcc897b9e5ae013c621ab671254f Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Fri, 14 Oct 2022 18:11:08 -0400 Subject: [PATCH 09/56] SWDEV-361623 - correct remove edge behaviour - remove node dependency before checking parents - reduce edge level according to new value of node Signed-off-by: sdashmiz Change-Id: Id4bff1684f7e0b42beeebc4d2e009bfdb507fb5f --- src/hip_graph.cpp | 2 +- src/hip_graph_internal.hpp | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp index 35b14e23..75d0f3d7 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -1711,7 +1711,7 @@ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* fr } for (size_t i = 0; i < numDependencies; i++) { if (to[i]->GetParentGraph() != graph || from[i]->GetParentGraph() != graph || - from[i]->RemoveEdge(to[i]) == false) { + from[i]->RemoveUpdateEdge(to[i]) == false) { HIP_RETURN(hipErrorInvalidValue); } } diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 5fb557dc..dfebf119 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -265,6 +265,9 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { dependencies_.erase(std::remove(dependencies_.begin(), dependencies_.end(), node), dependencies_.end()); } + void RemoveEdge(const Node& childNode) { + edges_.erase(std::remove(edges_.begin(), edges_.end(), childNode), edges_.end()); + } /// Return graph node children const std::vector& GetEdges() const { return edges_; } /// Updates graph node children @@ -280,6 +283,12 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { edge->UpdateEdgeLevel(); } } + void ReduceEdgeLevel() { + for (auto edge: edges_) { + edge->SetLevel(std::min(edge->GetLevel(),GetLevel() + 1)); + edge->ReduceEdgeLevel(); + } + } /// Add edge, update parent node outdegree, child node
indegree, level and dependency void AddEdge(const Node& childNode) { edges_.push_back(childNode); @@ -290,7 +299,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { childNode->AddDependency(this); } /// Remove edge, update parent node outdegree, child node indegree, level and dependency - bool RemoveEdge(const Node& childNode) { + bool RemoveUpdateEdge(const Node& childNode) { // std::remove changes the end() hence saving it before hand for validation auto currEdgeEnd = edges_.end(); auto it = std::remove(edges_.begin(), edges_.end(), childNode); @@ -301,15 +310,20 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { edges_.erase(it, edges_.end()); outDegree_--; childNode->SetInDegree(childNode->GetInDegree() - 1); + childNode->RemoveDependency(this); const std::vector& dependencies = childNode->GetDependencies(); int32_t level = 0; int32_t parentLevel = 0; + uint32_t origLevel = 0; for (auto parent : dependencies) { parentLevel = parent->GetLevel(); level = std::max(level, (parentLevel + 1)); } + origLevel = childNode->GetLevel(); childNode->SetLevel(level); - childNode->RemoveDependency(this); + if (level < origLevel) { + childNode->ReduceEdgeLevel(); + } return true; } /// Get Runlist of the nodes embedded as part of the graphnode(e.g. 
ChildGraph) From d6871b77d7ae0bc65652684c6ef60e4050dd84d9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 23 Dec 2022 10:13:30 -0500 Subject: [PATCH 10/56] SWDEV-1 - Use proper builtins for math intrinsics Change-Id: If0e420332c718abe0e9f6316c467b6b29b80b183 --- include/hip/amd_detail/amd_math_functions.h | 18 +++++++++--------- include/hip/amd_detail/math_fwd.h | 20 -------------------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/include/hip/amd_detail/amd_math_functions.h b/include/hip/amd_detail/amd_math_functions.h index 471f6ee3..3c17d298 100644 --- a/include/hip/amd_detail/amd_math_functions.h +++ b/include/hip/amd_detail/amd_math_functions.h @@ -640,22 +640,22 @@ inline float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); } __DEVICE__ inline -float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); } +float __frcp_rd(float x) { return __builtin_amdgcn_rcpf(x); } #endif __DEVICE__ inline -float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); } +float __frcp_rn(float x) { return __builtin_amdgcn_rcpf(x); } #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ inline -float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); } +float __frcp_ru(float x) { return __builtin_amdgcn_rcpf(x); } __DEVICE__ inline -float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); } +float __frcp_rz(float x) { return __builtin_amdgcn_rcpf(x); } #endif __DEVICE__ inline -float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); } +float __frsqrt_rn(float x) { return __builtin_amdgcn_rsqf(x); } #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ inline @@ -1155,18 +1155,18 @@ inline double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); } __DEVICE__ inline -double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); } +double __drcp_rd(double x) { return __builtin_amdgcn_rcp(x); } #endif __DEVICE__ inline -double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); } +double __drcp_rn(double x) { 
return __builtin_amdgcn_rcp(x); } #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ inline -double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); } +double __drcp_ru(double x) { return __builtin_amdgcn_rcp(x); } __DEVICE__ inline -double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); } +double __drcp_rz(double x) { return __builtin_amdgcn_rcp(x); } __DEVICE__ inline double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); } diff --git a/include/hip/amd_detail/math_fwd.h b/include/hip/amd_detail/math_fwd.h index 050c88c1..9e999268 100644 --- a/include/hip/amd_detail/math_fwd.h +++ b/include/hip/amd_detail/math_fwd.h @@ -386,19 +386,6 @@ float __ocml_fma_rtp_f32(float, float, float); __device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float); - -__device__ -__attribute__((const)) -float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32"); -__device__ -__attribute__((const)) -float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32"); -__device__ -__attribute__((const)) -float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32"); -__device__ -__attribute__((const)) -float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32"); // END INTRINSICS // END FLOAT @@ -697,13 +684,6 @@ double __ocml_fma_rtp_f64(double, double, double); __device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, double); - -__device__ -__attribute__((const)) -double __llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64"); -__device__ -__attribute__((const)) -double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64"); // END INTRINSICS // END DOUBLE From 6c355059c98c839f3bfe4625b5149b7aa8aaa461 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Fri, 27 Jan 2023 13:07:01 +0000 Subject: [PATCH 11/56] SWDEV-377804 - Initial commit to support hipGraphInstantiateFlagAutoFreeOnLaunch Change-Id: I7a35becb6c98a6ff70264e141317d98be7457a37 --- src/hip_graph.cpp | 8 +++++--- src/hip_graph_internal.cpp | 5 +++++ 
src/hip_graph_internal.hpp | 12 +++++++++--- src/hip_mempool_impl.cpp | 6 ++++++ src/hip_mempool_impl.hpp | 4 ++-- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp index 75d0f3d7..642a4bd7 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -1215,7 +1215,8 @@ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t grap HIP_RETURN(status); } -hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) { +hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, + uint64_t flags = 0) { if (pGraphExec == nullptr || graph == nullptr) { HIP_RETURN(hipErrorInvalidValue); } @@ -1232,7 +1233,8 @@ hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) { clonedGraph->LevelOrder(levelOrder); clonedGraph->GetUserObjs(graphExeUserObj); *pGraphExec = - new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes, graphExeUserObj); + new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes, + graphExeUserObj, flags); if (*pGraphExec != nullptr) { return (*pGraphExec)->Init(); } else { @@ -1247,7 +1249,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, } hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph, - unsigned long long flags) { + unsigned long long flags = 0) { HIP_INIT_API(hipGraphInstantiateWithFlags, pGraphExec, graph, flags); if (pGraphExec == nullptr || graph == nullptr) { HIP_RETURN(hipErrorInvalidValue); diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp index 3ce74cab..ab7fc65d 100644 --- a/src/hip_graph_internal.cpp +++ b/src/hip_graph_internal.cpp @@ -796,6 +796,11 @@ hipError_t hipGraphExec::Run(hipStream_t stream) { if (queue == nullptr) { return hipErrorInvalidResourceHandle; } + if (flags_ == hipGraphInstantiateFlagAutoFreeOnLaunch) { + if (!levelOrder_.empty()) { + levelOrder_[0]->GetParentGraph()->FreeAllMemory(); + 
} + } UpdateQueue(parallelLists_, queue, this); std::vector rootCommands; amd::Command* endCommand = nullptr; diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index dfebf119..536e0387 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -530,6 +530,10 @@ struct ihipGraph { } return false; } + + void FreeAllMemory() { + mem_pool_->FreeAllMemory(); + } }; struct hipGraphExec { @@ -544,19 +548,21 @@ struct hipGraphExec { static std::unordered_set graphExecSet_; std::unordered_set graphExeUserObj_; static amd::Monitor graphExecSetLock_; - + uint64_t flags_ = 0; public: hipGraphExec(std::vector& levelOrder, std::vector>& lists, std::unordered_map>& nodeWaitLists, std::unordered_map& clonedNodes, - std::unordered_set& userObjs) + std::unordered_set& userObjs, + uint64_t flags = 0) : parallelLists_(lists), levelOrder_(levelOrder), nodeWaitLists_(nodeWaitLists), clonedNodes_(clonedNodes), lastEnqueuedCommand_(nullptr), graphExeUserObj_(userObjs), - currentQueueIndex_(0) { + currentQueueIndex_(0), + flags_(flags) { amd::ScopedLock lock(graphExecSetLock_); graphExecSet_.insert(this); } diff --git a/src/hip_mempool_impl.cpp b/src/hip_mempool_impl.cpp index ddec8f49..2606688f 100644 --- a/src/hip_mempool_impl.cpp +++ b/src/hip_mempool_impl.cpp @@ -397,4 +397,10 @@ void MemoryPool::GetAccess(hip::Device* device, hipMemAccessFlags* flags) { } } +void MemoryPool::FreeAllMemory(hip::Stream* stream) { + while (!busy_heap_.Allocations().empty()) { + FreeMemory(busy_heap_.Allocations().begin()->first, stream); + } +} + } diff --git a/src/hip_mempool_impl.hpp b/src/hip_mempool_impl.hpp index e42bc7eb..9d176b17 100644 --- a/src/hip_mempool_impl.hpp +++ b/src/hip_mempool_impl.hpp @@ -136,7 +136,7 @@ class Heap : public amd::EmbeddedObject { bool IsActiveMemory(amd::Memory* memory) const { return (allocations_.find(memory) != allocations_.end()); } - + const auto& Allocations() { return allocations_; } private: Heap() = delete; Heap(const Heap&) 
= delete; @@ -217,7 +217,7 @@ class MemoryPool : public amd::ReferenceCountedObject { bool EventDependencies() const { return (state_.event_dependencies_) ? true : false; } bool Opportunistic() const { return (state_.opportunistic_) ? true : false; } bool InternalDependencies() const { return (state_.internal_dependencies_) ? true : false; } - + void FreeAllMemory(hip::Stream* stream = nullptr); private: MemoryPool() = delete; MemoryPool(const MemoryPool&) = delete; From f5393b99a91df99967e6761bcb3c07c02be9573c Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Tue, 31 Jan 2023 13:22:22 +0000 Subject: [PATCH 12/56] SWDEV-379395 - Use getQueue as it handles null stream and flags can be combination. Change-Id: If37854a9fc0fc57acb30e652953a7f283831cad8 --- src/hip_module.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hip_module.cpp b/src/hip_module.cpp index f3ae2611..ae5d2660 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -516,7 +516,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* return hipErrorInvalidValue; } - if ((flags < 0) || (flags > hipCooperativeLaunchMultiDeviceNoPostSync)) { + if (flags > (hipCooperativeLaunchMultiDeviceNoPostSync + + hipCooperativeLaunchMultiDeviceNoPreSync)) { return hipErrorInvalidValue; } @@ -730,7 +731,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL return hipErrorInvalidValue; } - amd::HostQueue* queue = reinterpret_cast(launch.stream)->asHostQueue(); + amd::HostQueue* queue = hip::getQueue(launch.stream); hipFunction_t func = nullptr; // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { From 4e87bbf7e985fd28722dfceb13c3858991057eb2 Mon Sep 17 00:00:00 2001 From: Matthew Arsenault Date: Wed, 25 Jan 2023 14:47:48 -0500 Subject: [PATCH 13/56] SWDEV-1 - Reapply "SWDEV-1 - Directly call __builtin_amdgcn_fence" This reverts commit 
bcf857c23772f810942b305721f4132cbb7de654. Resubmit after https://github.com/ROCmSoftwarePlatform/rocPRIM/commit/ba8a86166ec9c4cf3a40ecde7017feeb1484e7f6 Change-Id: I194559484a5a0d7f4443381fe77218ea73d2def9 --- include/hip/amd_detail/amd_device_functions.h | 17 ++++++-------- include/hip/amd_detail/device_library_decls.h | 22 ------------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h index 57403eb3..30d2511e 100644 --- a/include/hip/amd_detail/amd_device_functions.h +++ b/include/hip/amd_detail/amd_device_functions.h @@ -777,21 +777,21 @@ __device__ inline static void __threadfence() { - __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device); + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); } __device__ inline static void __threadfence_block() { - __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group); + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); } __device__ inline static void __threadfence_system() { - __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices); + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); } // abort @@ -864,14 +864,11 @@ void __assertfail() } #endif /* defined(_WIN32) || defined(_WIN64) */ -__device__ -inline -static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope) -{ +__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) { if (flags) { - __atomic_work_item_fence(flags, __memory_order_release, scope); + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); __builtin_amdgcn_s_barrier(); - __atomic_work_item_fence(flags, __memory_order_acquire, scope); + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); } else { __builtin_amdgcn_s_barrier(); } @@ -881,7 +878,7 @@ __device__ inline static void __barrier(int n) { - __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group); + 
__work_group_barrier((__cl_mem_fence_flags)n); } __device__ diff --git a/include/hip/amd_detail/device_library_decls.h b/include/hip/amd_detail/device_library_decls.h index 8add4fa2..02228705 100644 --- a/include/hip/amd_detail/device_library_decls.h +++ b/include/hip/amd_detail/device_library_decls.h @@ -128,26 +128,4 @@ __device__ inline static __local void* __to_local(unsigned x) { return (__local #define __CLK_LOCAL_MEM_FENCE 0x01 typedef unsigned __cl_mem_fence_flags; -typedef enum __memory_scope { - __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, - __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -} __memory_scope; - -// enum values aligned with what clang uses in EmitAtomicExpr() -typedef enum __memory_order -{ - __memory_order_relaxed = __ATOMIC_RELAXED, - __memory_order_acquire = __ATOMIC_ACQUIRE, - __memory_order_release = __ATOMIC_RELEASE, - __memory_order_acq_rel = __ATOMIC_ACQ_REL, - __memory_order_seq_cst = __ATOMIC_SEQ_CST -} __memory_order; - -// Linked from hip.amdgcn.bc -extern "C" __device__ void -__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope); - #endif From 6b3d25153de17c783e73bec96ee0c4ac51c25594 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Fri, 3 Feb 2023 10:33:55 -0500 Subject: [PATCH 14/56] SWDEV-353281 - Switch Graph to operate with stream MemPool was designed to use hip::Stream, but graph implementation uses amd::HostQueue. Hence switch graph to hip::Stream management. 
Change-Id: Ia319389de45e4c3c6043d17473279a6f27a13140 --- src/hip_graph_internal.cpp | 43 ++++++++++++++++++-------------------- src/hip_graph_internal.hpp | 43 +++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp index ab7fc65d..48a7bc30 100644 --- a/src/hip_graph_internal.cpp +++ b/src/hip_graph_internal.cpp @@ -679,34 +679,29 @@ bool hipGraphExec::isGraphExecValid(hipGraphExec* pGraphExec) { return true; } -hipError_t hipGraphExec::CreateQueues(size_t numQueues) { - parallelQueues_.reserve(numQueues); - for (size_t i = 0; i < numQueues; i++) { - amd::HostQueue* queue; - queue = new amd::HostQueue( - *hip::getCurrentDevice()->asContext(), *hip::getCurrentDevice()->devices()[0], 0, - amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal); - - bool result = (queue != nullptr) ? queue->create() : false; - // Create a host queue - if (result) { - parallelQueues_.push_back(queue); - } else { - ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create host queue\n"); +hipError_t hipGraphExec::CreateStreams(uint32_t num_streams) { + parallel_streams_.reserve(num_streams); + for (uint32_t i = 0; i < num_streams; ++i) { + auto stream = new hip::Stream(hip::getCurrentDevice(), + hip::Stream::Priority::Normal, hipStreamNonBlocking); + if (stream == nullptr || !stream->Create()) { + delete stream; + ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n"); return hipErrorOutOfMemory; } + parallel_streams_.push_back(stream); } return hipSuccess; } hipError_t hipGraphExec::Init() { hipError_t status; - size_t reqNumQueues = 1; + size_t min_num_streams = 1; for (auto& node : levelOrder_) { - reqNumQueues += node->GetNumParallelQueues(); + min_num_streams += node->GetNumParallelStreams(); } - status = CreateQueues(parallelLists_.size() - 1 + reqNumQueues); + status = CreateStreams(parallelLists_.size() - 1 + 
min_num_streams); return status; } @@ -771,19 +766,19 @@ hipError_t FillCommands(std::vector>& parallelLists, return hipSuccess; } -void UpdateQueue(std::vector>& parallelLists, amd::HostQueue*& queue, +void UpdateStream(std::vector>& parallelLists, hip::Stream* stream, hipGraphExec* ptr) { int i = 0; for (const auto& list : parallelLists) { // first parallel list will be launched on the same queue as parent if (i == 0) { for (auto& node : list) { - node->SetQueue(queue, ptr); + node->SetStream(stream, ptr); } - } else { // New queue for parallel branches - amd::HostQueue* paralleQueue = ptr->GetAvailableQueue(); + } else { // New stream for parallel branches + hip::Stream* stream = ptr->GetAvailableStreams(); for (auto& node : list) { - node->SetQueue(paralleQueue, ptr); + node->SetStream(stream, ptr); } } i++; @@ -801,7 +796,9 @@ hipError_t hipGraphExec::Run(hipStream_t stream) { levelOrder_[0]->GetParentGraph()->FreeAllMemory(); } } - UpdateQueue(parallelLists_, queue, this); + auto hip_stream = (stream == nullptr) ? 
hip::getCurrentDevice()->GetNullStream() + : reinterpret_cast(stream); + UpdateStream(parallelLists_, hip_stream, this); std::vector rootCommands; amd::Command* endCommand = nullptr; status = diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 536e0387..e2bee8a1 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -39,8 +39,8 @@ hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& levelOrder, std::vector& rootCommands, amd::Command*& endCommand, amd::HostQueue* queue); -void UpdateQueue(std::vector>& parallelLists, amd::HostQueue*& queue, - hipGraphExec* ptr); +void UpdateStream(std::vector>& parallelLists, hip::Stream* stream, + hipGraphExec* ptr); struct hipUserObject : public amd::ReferenceCountedObject { typedef void (*UserCallbackDestructor)(void* data); @@ -154,6 +154,7 @@ struct hipGraphNodeDOTAttribute { struct hipGraphNode : public hipGraphNodeDOTAttribute { protected: + hip::Stream* stream_ = nullptr; amd::HostQueue* queue_; uint32_t level_; unsigned int id_; @@ -223,7 +224,10 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { amd::HostQueue* GetQueue() { return queue_; } - virtual void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) { queue_ = queue; } + virtual void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) { + stream_ = stream; + queue_ = stream->asHostQueue(); + } /// Create amd::command for the graph node virtual hipError_t CreateCommand(amd::HostQueue* queue) { commands_.clear(); @@ -337,7 +341,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { command->updateEventWaitList(waitList); } } - virtual size_t GetNumParallelQueues() { return 0; } + virtual size_t GetNumParallelStreams() { return 0; } /// Enqueue commands part of the node virtual void EnqueueCommands(hipStream_t stream) { // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker. 
@@ -541,7 +545,7 @@ struct hipGraphExec { // level order of the graph doesn't include nodes embedded as part of the child graph std::vector levelOrder_; std::unordered_map> nodeWaitLists_; - std::vector parallelQueues_; + std::vector parallel_streams_; uint currentQueueIndex_; std::unordered_map clonedNodes_; amd::Command* lastEnqueuedCommand_; @@ -570,8 +574,8 @@ struct hipGraphExec { ~hipGraphExec() { // new commands are launched for every launch they are destroyed as and when command is // terminated after it complete execution - for (auto queue : parallelQueues_) { - queue->release(); + for (auto stream : parallel_streams_) { + delete stream; } for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second; amd::ScopedLock lock(graphExecSetLock_); @@ -596,10 +600,10 @@ struct hipGraphExec { std::vector& GetNodes() { return levelOrder_; } - amd::HostQueue* GetAvailableQueue() { return parallelQueues_[currentQueueIndex_++]; } + hip::Stream* GetAvailableStreams() { return parallel_streams_[currentQueueIndex_++]; } void ResetQueueIndex() { currentQueueIndex_ = 0; } hipError_t Init(); - hipError_t CreateQueues(size_t numQueues); + hipError_t CreateStreams(uint32_t num_streams); hipError_t Run(hipStream_t stream); }; @@ -628,20 +632,21 @@ struct hipChildGraphNode : public hipGraphNode { ihipGraph* GetChildGraph() { return childGraph_; } - size_t GetNumParallelQueues() { + size_t GetNumParallelStreams() { LevelOrder(childGraphlevelOrder_); size_t num = 0; for (auto& node : childGraphlevelOrder_) { - num += node->GetNumParallelQueues(); + num += node->GetNumParallelStreams(); } // returns total number of parallel queues required for child graph nodes to be launched // first parallel list will be launched on the same queue as parent return num + (parallelLists_.size() - 1); } - void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) { - queue_ = queue; - UpdateQueue(parallelLists_, queue, ptr); + void SetStream(hip::Stream* stream, 
hipGraphExec* ptr = nullptr) { + stream_ = stream; + queue_ = stream->asHostQueue(); + UpdateStream(parallelLists_, stream, ptr); } // For nodes that are dependent on the child graph node waitlist is the last node of the first @@ -1876,9 +1881,7 @@ class hipGraphMemAllocNode : public hipGraphNode { virtual hipError_t CreateCommand(amd::HostQueue* queue) { auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! - auto ptr = Execute(); + auto ptr = Execute(stream_); return error; } @@ -1919,13 +1922,11 @@ class hipGraphMemFreeNode : public hipGraphNode { virtual hipError_t CreateCommand(amd::HostQueue* queue) { auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! - Execute(); + Execute(stream_); return error; } - void Execute(hip::Stream* stream = nullptr) { + void Execute(hip::Stream* stream) { auto graph = GetParentGraph(); if (graph != nullptr) { graph->FreeMemory(device_ptr_, stream); From 24077b409e9e259ef60e8db295b0ce86d06ae6b0 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Mon, 6 Feb 2023 17:27:31 +0000 Subject: [PATCH 15/56] SWDEV-380405 - Make hipMemcpyPeer async w.r.t. host and let dst device wait till copy finishes by src device. 
Change-Id: Idbe25404d0c31b93436e190b548f12f327f47a04 --- src/hip_memory.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index f3fcf68f..3eb2ad1e 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -464,6 +464,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (src == dst && kind == hipMemcpyDefault) { return hipSuccess; } + bool isP2P = false; size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; @@ -475,6 +476,11 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin isAsync = false; } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { isAsync = false; + } else if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && + (srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1)) { + isAsync = true; + isP2P = true; } amd::Command* command = nullptr; status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, queue, isAsync); @@ -484,6 +490,15 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin command->enqueue(); if (!isAsync) { command->awaitCompletion(); + } else if (isP2P) { + amd::HostQueue* pQueue = hip::getNullStream(dstMemory->getContext()); + amd::Command::EventWaitList waitList; + waitList.push_back(command); + amd::Command* depdentMarker = new amd::Marker(*pQueue, false, waitList); + if (depdentMarker != nullptr) { + depdentMarker->enqueue(); + depdentMarker->release(); + } } else { amd::HostQueue* newQueue = command->queue(); if (newQueue != &queue) { From 69d5abba5a401b6b706e97a0d1d190b3d5ebe831 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 2 Feb 2023 00:39:33 +0000 Subject: [PATCH 16/56] SWDEV-380466 - Fixed typo in unsafe atomicAdd for doubles and gfx940 Change-Id: I3b9ea4773f94bf6a9b9dfb655062f42aeb3b5eba --- 
include/hip/amd_detail/amd_hip_unsafe_atomics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/include/hip/amd_detail/amd_hip_unsafe_atomics.h index f9e9738a..7c2a329e 100644 --- a/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -178,7 +178,7 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) { * @return Original value contained in \p addr. */ __device__ inline double unsafeAtomicAdd(double* addr, double value) { -#if (defined(__gfx90a__) || defined(__gfx940_)) && \ +#if (defined(__gfx90a__) || defined(__gfx940__)) && \ __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64) return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value); #elif defined (__hip_atomic_fetch_add) From 97292e417c89327edd9b2adc0bb26e0d2be3dc09 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 2 Feb 2023 12:08:59 +0000 Subject: [PATCH 17/56] SWDEV-380687 - Fixed typo in safe atomicAdd for gfx90a Change-Id: I87bc6d9e0ef1f564d679c6280c71c4633d3aa619 --- include/hip/amd_detail/amd_hip_unsafe_atomics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/include/hip/amd_detail/amd_hip_unsafe_atomics.h index 7c2a329e..0100e99e 100644 --- a/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -310,7 +310,7 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) { */ __device__ inline float safeAtomicAdd(float* addr, float value) { #if defined(__gfx908__) || \ - (defined(__gfx90a) && !__has_builtin(__hip_atomic_fetch_add)) + (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add)) // On gfx908, we can generate unsafe FP32 atomic add that does not follow all // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead. 
// On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to From e73773fb240b7eeb949b0f0928bbbda9e061e1dd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 23 Dec 2022 10:17:02 -0500 Subject: [PATCH 18/56] SWDEV-1 - Eliminate rcp_2f16 pseudo intrinsic No such wrapper or intrinsic would ever exist because there is no such underlying instruction. Change-Id: I6c3f64cd2df2a58edf32037da8f5712868f296ea --- include/hip/amd_detail/amd_hip_fp16.h | 5 ++++- include/hip/amd_detail/hip_fp16_math_fwd.h | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h index fb07cfb6..694deb22 100644 --- a/include/hip/amd_detail/amd_hip_fp16.h +++ b/include/hip/amd_detail/amd_hip_fp16.h @@ -1672,7 +1672,10 @@ THE SOFTWARE. __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } inline __HOST_DEVICE__ - __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); } + __half2 h2rcp(__half2 x) { + return _Float16_2{__llvm_amdgcn_rcp_f16(x.x), + __llvm_amdgcn_rcp_f16(x.y)}; + } inline __HOST_DEVICE__ __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } diff --git a/include/hip/amd_detail/hip_fp16_math_fwd.h b/include/hip/amd_detail/hip_fp16_math_fwd.h index 36942c1a..aac0bcca 100644 --- a/include/hip/amd_detail/hip_fp16_math_fwd.h +++ b/include/hip/amd_detail/hip_fp16_math_fwd.h @@ -73,11 +73,6 @@ extern "C" __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); - __device__ inline - __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL. 
- { - return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)}; - } __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); __device__ __2f16 __ocml_sin_2f16(__2f16); From 36f929bb42a252533a61f1e275de6565bfbaa7ae Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 23 Dec 2022 10:22:13 -0500 Subject: [PATCH 19/56] SWDEV-1 - Use proper __builtin_amdgcn_rcph builtin Change-Id: Ic0b1b25a5ab7d132cb51425f2a17bc779c762668 --- include/hip/amd_detail/amd_hip_fp16.h | 6 +++--- include/hip/amd_detail/hip_fp16_math_fwd.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h index 694deb22..fda27219 100644 --- a/include/hip/amd_detail/amd_hip_fp16.h +++ b/include/hip/amd_detail/amd_hip_fp16.h @@ -1569,7 +1569,7 @@ THE SOFTWARE. __half hrcp(__half x) { return __half_raw{ - __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)}; + __builtin_amdgcn_rcph(static_cast<__half_raw>(x).data)}; } inline __device__ @@ -1673,8 +1673,8 @@ THE SOFTWARE. 
inline __HOST_DEVICE__ __half2 h2rcp(__half2 x) { - return _Float16_2{__llvm_amdgcn_rcp_f16(x.x), - __llvm_amdgcn_rcp_f16(x.y)}; + return _Float16_2{__builtin_amdgcn_rcph(x.x), + __builtin_amdgcn_rcph(x.y)}; } inline __HOST_DEVICE__ diff --git a/include/hip/amd_detail/hip_fp16_math_fwd.h b/include/hip/amd_detail/hip_fp16_math_fwd.h index aac0bcca..caf6ec75 100644 --- a/include/hip/amd_detail/hip_fp16_math_fwd.h +++ b/include/hip/amd_detail/hip_fp16_math_fwd.h @@ -44,7 +44,6 @@ extern "C" __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); - __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); __device__ _Float16 __ocml_sin_f16(_Float16); From d589b47bcd131a28b0b17331717a20f882076639 Mon Sep 17 00:00:00 2001 From: pghafari Date: Mon, 23 Jan 2023 18:06:18 -0500 Subject: [PATCH 20/56] SWDEV-377571 - adding scopelock for hipDeviceReset Change-Id: I3dd95a40d6abff721a4774f26e99d1162bafdfa1 --- src/hip_device.cpp | 15 +++++++++------ src/hip_internal.hpp | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index fedd62e5..a3d059ac 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -107,13 +107,16 @@ void Device::RemoveStreamFromPools(Stream* stream) { // ================================================================================================ void Device::Reset() { - auto it = mem_pools_.begin(); - while (it != mem_pools_.end()) { - auto current = it++; - (*current)->ReleaseAllMemory(); - delete *current; + { + amd::ScopedLock lock(lock_); + auto it = mem_pools_.begin(); + while (it != mem_pools_.end()) { + auto current = it++; + (*current)->ReleaseAllMemory(); + delete *current; + 
} + mem_pools_.clear(); } - mem_pools_.clear(); flags_ = hipDeviceScheduleSpin; hip::Stream::destroyAllStreams(deviceId_); amd::MemObjMap::Purge(devices()[0]); diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 31ec410d..2c3f7fb9 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -382,7 +382,7 @@ namespace hip { /// HIP Device class class Device { - amd::Monitor lock_{"Device lock"}; + amd::Monitor lock_{"Device lock", true}; /// ROCclr context amd::Context* context_; /// Device's ID From c374fd8811c2320cc4b00b2d13e54662bc623139 Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Fri, 3 Feb 2023 11:45:07 +0000 Subject: [PATCH 21/56] SWDEV-380405 - Check for invalid stream for hipMemcpyPeerAsync. Change-Id: I6dfccb4d20bb638bd596c071030c68889743d706 --- src/hip_peer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index fb6b3329..d5255103 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -232,7 +232,9 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src srcDevice < 0 || dstDevice < 0) { HIP_RETURN(hipErrorInvalidDevice); } - + if (!hip::isValid(stream)) { + return hipErrorContextIsDestroyed; + } HIP_RETURN(hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); } From c225227282aa6c104996f23ffed0178fe8c41b0f Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 8 Feb 2023 10:20:59 -0800 Subject: [PATCH 22/56] SWDEV-381633 - Better log Change-Id: I5c59d42462fe8a233ef10e26ec67314219aeb167 --- src/hip_context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_context.cpp b/src/hip_context.cpp index 824e6959..515e1813 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -44,7 +44,7 @@ bool init() { if (!amd::Runtime::init()) { return false; } - LogPrintfInfo("Direct Dispatch: %d", AMD_DIRECT_DISPATCH); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH); const 
std::vector& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false); From a0c4e4085446aaff5cadc7f8bfb8a34249ab43e6 Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Tue, 7 Feb 2023 11:23:43 -0500 Subject: [PATCH 23/56] SWDEV-374378 - correct setparam for memcpy node - params should be valid when used for default flag since we support unified virtual address space Signed-off-by: sdashmiz Change-Id: I75d40e437b12ee58e72e423bb4818b484ce35b66 --- src/hip_graph_internal.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp index 48a7bc30..68bcab0a 100644 --- a/src/hip_graph_internal.cpp +++ b/src/hip_graph_internal.cpp @@ -77,14 +77,14 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { return hipErrorInvalidValue; } - if (kind != hipMemcpyHostToDevice) { + if ((kind != hipMemcpyHostToDevice) && (kind != hipMemcpyDefault)) { return hipErrorInvalidValue; } } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { // device to host if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) { return hipErrorInvalidValue; } - if (kind != hipMemcpyDeviceToHost) { + if ((kind != hipMemcpyDeviceToHost) && (kind != hipMemcpyDefault)) { return hipErrorInvalidValue; } } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { From e8a17711dd0b406bb9090d4eb9a4269d834c4e46 Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Wed, 1 Feb 2023 09:12:56 +0000 Subject: [PATCH 24/56] SWDEV-370552 - Correct the HIPRTC behavior to optimize the ISA only once Change-Id: Idaf0ea8294657db3666e9548deb6a9629e0ee718 --- src/hiprtc/hiprtcInternal.cpp | 63 ++++++++++++++++++++++++++--------- src/hiprtc/hiprtcInternal.hpp | 4 ++- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/hiprtc/hiprtcInternal.cpp b/src/hiprtc/hiprtcInternal.cpp index 
e4e4f044..188b42d8 100644 --- a/src/hiprtc/hiprtcInternal.cpp +++ b/src/hiprtc/hiprtcInternal.cpp @@ -117,11 +117,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.reserve(20); // count of options below compile_options_.push_back("-O3"); -#ifdef HIPRTC_EARLY_INLINE - compile_options_.push_back("-mllvm"); - compile_options_.push_back("-amdgpu-early-inline-all"); -#endif - if (GPU_ENABLE_WGP_MODE) compile_options_.push_back("-mcumode"); if (!GPU_ENABLE_WAVE32_MODE) compile_options_.push_back("-mwavefrontsize64"); @@ -188,7 +183,23 @@ bool RTCCompileProgram::addBuiltinHeader() { return true; } -bool RTCCompileProgram::transformOptions() { +bool RTCCompileProgram::findLLVMOptions(const std::vector& options, + std::vector& llvm_options) { + for (size_t i = 0; i < options.size(); ++i) { + if (options[i] == "-mllvm") { + if (options.size() == (i+1)) { + LogInfo( + "-mllvm option passed by the app, it comes as a pair but there is no option after this"); + return false; + } + llvm_options.push_back(options[i]); + llvm_options.push_back(options[i + 1]); + } + } + return true; +} + +bool RTCCompileProgram::transformOptions(std::vector& compile_options) { auto getValueOf = [](const std::string& option) { std::string res; auto f = std::find(option.begin(), option.end(), '='); @@ -196,7 +207,7 @@ bool RTCCompileProgram::transformOptions() { return res; }; - for (auto& i : compile_options_) { + for (auto& i : compile_options) { if (i == "-hip-pch") { LogInfo( "-hip-pch is deprecated option, has no impact on execution of new hiprtc programs, it " @@ -218,9 +229,9 @@ bool RTCCompileProgram::transformOptions() { } if (auto res = std::find_if( - compile_options_.begin(), compile_options_.end(), + compile_options.begin(), compile_options.end(), [](const std::string& str) { return str.find("--offload-arch=") != std::string::npos; }); - res != compile_options_.end()) { + res != compile_options.end()) { auto isaName = getValueOf(*res); 
isa_ = "amdgcn-amd-amdhsa--" + isaName; settings_.offloadArchProvided = true; @@ -242,15 +253,21 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg fgpu_rdc_ = fgpu_rdc; // Append compile options - compile_options_.reserve(compile_options_.size() + options.size()); - compile_options_.insert(compile_options_.end(), options.begin(), options.end()); + std::vector compileOpts(compile_options_); + compileOpts.reserve(compile_options_.size() + options.size() + 2); + compileOpts.insert(compileOpts.end(), options.begin(), options.end()); + + if (!fgpu_rdc_) { + compileOpts.push_back("-Xclang"); + compileOpts.push_back("-disable-llvm-passes"); + } - if (!transformOptions()) { + if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compile_options_, build_log_, LLVMBitcode_)) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } @@ -287,14 +304,30 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg return false; } + std::vector llvmOptions; + // Find the -mllvm options passed by the app such as "-mllvm" "-amdgpu-early-inline-all=true" + if (!findLLVMOptions(options, llvmOptions)) { + LogError("Error in hiprtc: unable to match -mllvm options"); + return false; + } + + std::vector exeOpts(exe_options_); + exeOpts.reserve(exeOpts.size() + llvmOptions.size() + 2); + // Add these options by default for optimizations during BC to Relocatable phase. 
+ exeOpts.push_back("-mllvm"); + exeOpts.push_back("-amdgpu-internalize-symbols"); + // User provided -mllvm options are appended at the end since they can override the above + // default options if necessary + exeOpts.insert(exeOpts.end(), llvmOptions.begin(), llvmOptions.end()); + if (settings_.dumpISA) { - if (!dumpIsaFromBC(exec_input_, isa_, exe_options_, name_, build_log_)) { + if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { LogError("Error in hiprtc: unable to dump isa code"); return false; } } - if (!createExecutable(exec_input_, isa_, exe_options_, build_log_, executable_)) { + if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { LogError("Error in hiprtc: unable to create executable"); return false; } diff --git a/src/hiprtc/hiprtcInternal.hpp b/src/hiprtc/hiprtcInternal.hpp index 8b846e49..28fcc90f 100644 --- a/src/hiprtc/hiprtcInternal.hpp +++ b/src/hiprtc/hiprtcInternal.hpp @@ -146,7 +146,9 @@ class RTCCompileProgram : public RTCProgram { // Private Member functions bool addSource_impl(); bool addBuiltinHeader(); - bool transformOptions(); + bool transformOptions(std::vector& compile_options); + bool findLLVMOptions(const std::vector& options, + std::vector& llvm_options); RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete; From 312dff7b794337aa040be0691acc78e9f968a8d2 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Wed, 8 Feb 2023 14:26:51 -0800 Subject: [PATCH 25/56] SWDEV-366831 - File reorg backward compatibility message changed to #error Change-Id: I44adb1a4b9bba0bd89edb75be715504ede2e3d08 --- header_template.hpp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/header_template.hpp.in b/header_template.hpp.in index 2a23abb1..5ec051d5 100644 --- a/header_template.hpp.in +++ b/header_template.hpp.in @@ -23,7 +23,7 @@ #define @include_guard@ #if defined(__GNUC__) -#warning "This file is deprecated. 
Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path" +#error "This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path" #else #pragma message("This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path") #endif From e6442326cec9af22c9d6e765d16dd70f32d58392 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Wed, 8 Feb 2023 19:28:51 +0000 Subject: [PATCH 26/56] SWDEV-375202 - Fixes print formatting for flags in hipGraphDebugPrint() to match CUDA Change-Id: I2d85fc38d2c65bc12534109883fe00802e77e62d --- src/hip_graph_internal.hpp | 48 ++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index e2bee8a1..935779a6 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -735,12 +735,31 @@ class hipGraphKernelNode : public hipGraphNode { unsigned int kernelAttrInUse_; public: + void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) { + out << "["; + out << "style"; + out << "=\""; + out << style_; + (flag == hipGraphDebugDotFlagsKernelNodeParams || + flag == hipGraphDebugDotFlagsKernelNodeAttributes) ? 
+ out << "\n" : out << "\""; + out << "shape"; + out << "=\""; + out << GetShape(flag); + out << "\""; + out << "label"; + out << "=\""; + out << GetLabel(flag); + out << "\""; + out << "];"; + } + std::string GetLabel(hipGraphDebugDotFlags flag) { hipFunction_t func = getFunc(*pKernelParams_, ihipGetDevice()); hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func); std::string label; - if (flag == hipGraphDebugDotFlagsKernelNodeParams || flag == hipGraphDebugDotFlagsVerbose) { - char buffer[500]; + char buffer[500]; + if (flag == hipGraphDebugDotFlagsVerbose) { sprintf(buffer, "{\n%s\n| {ID | %d | %s\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>}\n| {{node " "handle | func handle} | {%p | %p}}\n| {accessPolicyWindow | {base_ptr | num_bytes | " @@ -754,8 +773,29 @@ class hipGraphKernelNode : public hipGraphNode { kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp, kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative); label = buffer; - } else { - label = std::to_string(GetID()) + "\n" + function->name(); + } + else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) { + sprintf(buffer, + "{\n%s\n| {ID | %d | %s}\n" + "| {accessPolicyWindow | {base_ptr | num_bytes | " + "hitRatio | hitProp | missProp} |\n| {%p | %ld | %f | %d | %d}}\n| {cooperative | " + "%u}\n| {priority | 0}\n}", + label_.c_str(), GetID(), function->name().c_str(), + kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes, + kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp, + kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative); + label = buffer; + } + else if (flag == hipGraphDebugDotFlagsKernelNodeParams) { + sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>", + GetID(), function->name().c_str(), pKernelParams_->gridDim.x, + pKernelParams_->gridDim.y, pKernelParams_->gridDim.z, + pKernelParams_->blockDim.x, pKernelParams_->blockDim.y, + 
pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes); + label = buffer; + } + else { + label = std::to_string(GetID()) + "\n" + function->name() + "\n"; } return label; } From b2011d0b73923c5962e2c4c64afc15cce57ccdf6 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Tue, 7 Feb 2023 20:06:40 +0000 Subject: [PATCH 27/56] SWDEV-375194 - Fixes graph dot file descriptions to include labelId matching CUDA Change-Id: Ia0e6b77e38678b13457f86755ed8f3b852c9b7a1 --- src/hip_graph_internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 935779a6..a8d9c699 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -383,7 +383,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { fout << "\"" << fromNodeName << "\" -> \"" << toNodeName << "\"" << std::endl; } } - virtual std::string GetLabel() { return (std::to_string(id_) + "\n" + label_); } + virtual std::string GetLabel(hipGraphDebugDotFlags flag) { return (std::to_string(id_) + "\n" + label_); } unsigned int GetEnabled() const { return isEnabled_; } void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; } }; From 8a2ef62a5b023ba35f0c626f81591afd4c741365 Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Mon, 23 Jan 2023 15:50:41 -0500 Subject: [PATCH 28/56] SWDEV-377862 - wrong format for memcpy1d Signed-off-by: sdashmiz Change-Id: Ide6d252083b1812b0ad9cd182b1435e3e59b3aa1 --- src/hip_graph_internal.hpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index a8d9c699..952bb8ee 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -496,7 +496,7 @@ struct ihipGraph { ihipGraph* clone() const; void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) { fout << "subgraph cluster_" << GetID() << " {" << std::endl; - fout << "graph[style=\"dashed\" label=\"graph_" << GetID() << "\"];\n"; + 
fout << "label=\"graph_" << GetID() <<"\"graph[style=\"dashed\"];\n"; for (auto node : vertices_) { node->GenerateDOTNode(GetID(), fout, flag); } @@ -1371,9 +1371,15 @@ class hipGraphMemcpyNode1D : public hipGraphNode { if (flag == hipGraphDebugDotFlagsMemcpyNodeParams || flag == hipGraphDebugDotFlagsVerbose) { char buffer[500]; sprintf(buffer, - "{\n%s\n| {{ID | node handle | dst | src | count | kind } | {%u | %p | %p | %p | " - "%zu | %s}}}", - label_.c_str(), GetID(), this, dst_, src_, count_, memcpyDirection.c_str()); + "{\n%s\n| {{ID | node handle} | {%u | %p}}\n| {kind | %s}\n| {{srcPtr | dstPtr} | " + "{pitch " + "| ptr | xsize | ysize | pitch | ptr | xsize | size} | {%zu | %p | %zu | %zu | %zu | %p " + "| %zu " + "| %zu}}\n| {{srcPos | {{x | %zu} | {y | %zu} | {z | %zu}}} | {dstPos | {{x | %zu} | {y " + "| " + "%zu} | {z | %zu}}} | {Extent | {{Width | %zu} | {Height | %zu} | {Depth | %zu}}}}\n}", + label_.c_str(), GetID(), this, memcpyDirection.c_str(), (size_t)0, + src_, (size_t)0, (size_t)0, (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1); label = buffer; } else { label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," + From e71ab7957edb9a14d5ad81ade6618f4cf4b41a00 Mon Sep 17 00:00:00 2001 From: victzhan Date: Wed, 8 Feb 2023 16:15:10 -0500 Subject: [PATCH 29/56] SWDEV-376995 - Added if statement so that when AQL dispatch failed it returns corresponding hip error instead of hipSuccess Change-Id: I0724fbcf0833c5dd8b2e4d6e443b3e226046dddf --- src/hip_module.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hip_module.cpp b/src/hip_module.cpp index ae5d2660..76821cfe 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -336,10 +336,16 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, // Capture the kernel arguments if (CL_SUCCESS != kernelCommand->captureAndValidate()) { - delete 
kernelCommand; + kernelCommand->release(); return hipErrorOutOfMemory; } + + if (kernelCommand->status() == CL_INVALID_OPERATION) { + kernelCommand->release(); + return hipErrorIllegalState; + } command = kernelCommand; + return hipSuccess; } From af79ae6c8734fde3c5831dab19efafac54c08956 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 13 Feb 2023 23:42:35 -0500 Subject: [PATCH 30/56] SWDEV-366831 - Revert "File reorg backward compatibility message changed to #error" This reverts commit 312dff7b794337aa040be0691acc78e9f968a8d2. Reason for revert: Need app fixes first Change-Id: I1fea9a2ca8d57ef79a4f407bbc0d906976292eb1 --- header_template.hpp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/header_template.hpp.in b/header_template.hpp.in index 5ec051d5..2a23abb1 100644 --- a/header_template.hpp.in +++ b/header_template.hpp.in @@ -23,7 +23,7 @@ #define @include_guard@ #if defined(__GNUC__) -#error "This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path" +#warning "This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path" #else #pragma message("This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path") #endif From 20237f84df4a775726e9e15b43b616daa2f15cfd Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 13 Feb 2023 18:18:23 -0800 Subject: [PATCH 31/56] SWDEV-381898 - Move the lock after code object loading since COMGR is thread safe now and VDI also have locks. 
Change-Id: I15547f3ae3711d12a1ddf5cb7e9abd40a95ded89 --- src/hip_platform.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index aebd7f99..20d13162 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -673,8 +673,6 @@ void PlatformState::init() { } hipError_t PlatformState::loadModule(hipModule_t* module, const char* fname, const void* image) { - amd::ScopedLock lock(lock_); - if (module == nullptr) { return hipErrorInvalidValue; } @@ -689,6 +687,7 @@ hipError_t PlatformState::loadModule(hipModule_t* module, const char* fname, con *module = dynCo->module(); assert(*module != nullptr); + amd::ScopedLock lock(lock_); if (dynCO_map_.find(*module) != dynCO_map_.end()) { delete dynCo; return hipErrorAlreadyMapped; From ea6b5f0545b487c080584d73677500d2f960ee75 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 9 Jan 2023 15:05:03 -0800 Subject: [PATCH 32/56] SWDEV-376697 - Pass uncached memory flag when hipDeviceMallocUncached memory is requested. 
Change-Id: Idc12948047fdb69876e31edf34a8bcda46c9c303 --- src/hip_memory.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 3eb2ad1e..da7ed30a 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -527,7 +527,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag if (flags == hipDeviceMallocDefault) { ihipFlags = 0; } else if (flags == hipDeviceMallocFinegrained) { - ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN; + ihipFlags = CL_MEM_SVM_ATOMICS; + } else if (flags == hipDeviceMallocUncached) { + ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED; } else if (flags == hipMallocSignalMemory) { ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY; if (sizeBytes != 8) { From 101d84566fd901dea2bbbd7da65cb2640b7dc470 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 16 Feb 2023 01:27:08 +0000 Subject: [PATCH 33/56] SWDEV-381402 - convert host_device to host_context of type amd::Context* Change-Id: Ic1cb9a3fa64e16699fad7e9ec6679f1d34b14bef --- src/hip_context.cpp | 6 +++--- src/hip_device_runtime.cpp | 2 +- src/hip_hmm.cpp | 4 ++-- src/hip_internal.hpp | 2 +- src/hip_memory.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hip_context.cpp b/src/hip_context.cpp index 515e1813..307f8452 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -29,7 +29,7 @@ std::vector g_devices; namespace hip { thread_local TlsAggregator tls; -Device* host_device = nullptr; +amd::Context* host_context = nullptr; //init() is only to be called from the HIP_INIT macro only once bool init() { @@ -74,7 +74,7 @@ bool init() { if (CL_SUCCESS != hContext->create(nullptr)) { hContext->release(); } - host_device = new Device(hContext, -1); + host_context = hContext; PlatformState::instance().init(); return true; @@ -113,7 +113,7 @@ amd::HostQueue* getNullStream(amd::Context& ctx) { } // If 
it's a pure SVM allocation with system memory access, then it shouldn't matter which device // runtime selects by default - if (hip::host_device->asContext() == &ctx) { + if (hip::host_context == &ctx) { // Return current... return getNullStream(); } diff --git a/src/hip_device_runtime.cpp b/src/hip_device_runtime.cpp index def9bcb0..0288c885 100644 --- a/src/hip_device_runtime.cpp +++ b/src/hip_device_runtime.cpp @@ -602,7 +602,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) { switch (scheduleFlag) { case hipDeviceScheduleAuto: // Current behavior is different from the spec, due to MT usage in runtime - if (hip::host_device->devices().size() >= std::thread::hardware_concurrency()) { + if (hip::host_context->devices().size() >= std::thread::hardware_concurrency()) { device->SetActiveWait(false); break; } diff --git a/src/hip_hmm.cpp b/src/hip_hmm.cpp index 00687f84..be4c6cb4 100644 --- a/src/hip_hmm.cpp +++ b/src/hip_hmm.cpp @@ -233,8 +233,8 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align) { return hipSuccess; } - assert((hip::host_device->asContext()!= nullptr) && "Current host context must be valid"); - amd::Context& ctx = *hip::host_device->asContext(); + assert((hip::host_context != nullptr) && "Current host context must be valid"); + amd::Context& ctx = *hip::host_context; const amd::Device& dev = *ctx.devices()[0]; diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 2c3f7fb9..a416c1e7 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -513,7 +513,7 @@ namespace hip { extern thread_local TlsAggregator tls; /// Device representing the host - for pinned memory - extern Device* host_device; + extern amd::Context* host_context; extern bool init(); diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index da7ed30a..2fdb28b5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -274,7 +274,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) bool useHostDevice = (flags & 
CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0; amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); - amd::Context* amdContext = useHostDevice ? hip::host_device->asContext() : curDevContext; + amd::Context* amdContext = useHostDevice ? hip::host_context : curDevContext; if (amdContext == nullptr) { return hipErrorOutOfMemory; @@ -1155,7 +1155,7 @@ hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) if (hostPtr == nullptr || sizeBytes == 0 || flags > 15) { return hipErrorInvalidValue; } else { - amd::Memory* mem = new (*hip::host_device->asContext()) amd::Buffer(*hip::host_device->asContext(), + amd::Memory* mem = new (*hip::host_context) amd::Buffer(*hip::host_context, CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS, sizeBytes); constexpr bool sysMemAlloc = false; From 7ffdbb3e182840f1d830ef4d83bda210ec455917 Mon Sep 17 00:00:00 2001 From: Rakesh Roy Date: Wed, 17 Aug 2022 23:36:56 +0530 Subject: [PATCH 34/56] SWDEV-338733 - Implement hipArrayGet* APIs - Add implementation for hipArrayGetInfo, hipArrayGetDescriptor & hipArray3DGetDescriptor APIs Change-Id: I181a472066006bc3bd0d987408ea67e218310983 --- include/hip/amd_detail/hip_prof_str.h | 101 ++++++++++++++++-- .../nvidia_detail/nvidia_hip_runtime_api.h | 15 +++ src/amdhip.def | 3 + src/hip_hcc.def.in | 3 + src/hip_hcc.map.in | 9 ++ src/hip_memory.cpp | 98 +++++++++++++++-- 6 files changed, 215 insertions(+), 14 deletions(-) diff --git a/include/hip/amd_detail/hip_prof_str.h b/include/hip/amd_detail/hip_prof_str.h index 7ec70fdc..d72fd38d 100644 --- a/include/hip/amd_detail/hip_prof_str.h +++ b/include/hip/amd_detail/hip_prof_str.h @@ -370,10 +370,11 @@ enum hip_api_id_t { HIP_API_ID_hipGraphMemFreeNodeGetParams = 357, HIP_API_ID_hipModuleLaunchCooperativeKernel = 358, HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice = 359, - HIP_API_ID_LAST = 359, + HIP_API_ID_hipArray3DGetDescriptor = 360, + HIP_API_ID_hipArrayGetDescriptor = 361, + HIP_API_ID_hipArrayGetInfo = 362, + 
HIP_API_ID_LAST = 362, - HIP_API_ID_hipArray3DGetDescriptor = HIP_API_ID_NONE, - HIP_API_ID_hipArrayGetDescriptor = HIP_API_ID_NONE, HIP_API_ID_hipBindTexture = HIP_API_ID_NONE, HIP_API_ID_hipBindTexture2D = HIP_API_ID_NONE, HIP_API_ID_hipBindTextureToArray = HIP_API_ID_NONE, @@ -420,8 +421,11 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID___hipPopCallConfiguration: return "__hipPopCallConfiguration"; case HIP_API_ID___hipPushCallConfiguration: return "__hipPushCallConfiguration"; case HIP_API_ID_hipArray3DCreate: return "hipArray3DCreate"; + case HIP_API_ID_hipArray3DGetDescriptor: return "hipArray3DGetDescriptor"; case HIP_API_ID_hipArrayCreate: return "hipArrayCreate"; case HIP_API_ID_hipArrayDestroy: return "hipArrayDestroy"; + case HIP_API_ID_hipArrayGetDescriptor: return "hipArrayGetDescriptor"; + case HIP_API_ID_hipArrayGetInfo: return "hipArrayGetInfo"; case HIP_API_ID_hipChooseDevice: return "hipChooseDevice"; case HIP_API_ID_hipConfigureCall: return "hipConfigureCall"; case HIP_API_ID_hipCreateSurfaceObject: return "hipCreateSurfaceObject"; @@ -782,8 +786,11 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("__hipPopCallConfiguration", name) == 0) return HIP_API_ID___hipPopCallConfiguration; if (strcmp("__hipPushCallConfiguration", name) == 0) return HIP_API_ID___hipPushCallConfiguration; if (strcmp("hipArray3DCreate", name) == 0) return HIP_API_ID_hipArray3DCreate; + if (strcmp("hipArray3DGetDescriptor", name) == 0) return HIP_API_ID_hipArray3DGetDescriptor; if (strcmp("hipArrayCreate", name) == 0) return HIP_API_ID_hipArrayCreate; if (strcmp("hipArrayDestroy", name) == 0) return HIP_API_ID_hipArrayDestroy; + if (strcmp("hipArrayGetDescriptor", name) == 0) return HIP_API_ID_hipArrayGetDescriptor; + if (strcmp("hipArrayGetInfo", name) == 0) return HIP_API_ID_hipArrayGetInfo; if (strcmp("hipChooseDevice", name) == 0) return HIP_API_ID_hipChooseDevice; if (strcmp("hipConfigureCall", name) == 0) return 
HIP_API_ID_hipConfigureCall; if (strcmp("hipCreateSurfaceObject", name) == 0) return HIP_API_ID_hipCreateSurfaceObject; @@ -1164,6 +1171,12 @@ typedef struct hip_api_data_s { const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray; HIP_ARRAY3D_DESCRIPTOR pAllocateArray__val; } hipArray3DCreate; + struct { + HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor; + HIP_ARRAY3D_DESCRIPTOR pArrayDescriptor__val; + hipArray* array; + hipArray array__val; + } hipArray3DGetDescriptor; struct { hipArray** pHandle; hipArray* pHandle__val; @@ -1174,6 +1187,22 @@ typedef struct hip_api_data_s { hipArray* array; hipArray array__val; } hipArrayDestroy; + struct { + HIP_ARRAY_DESCRIPTOR* pArrayDescriptor; + HIP_ARRAY_DESCRIPTOR pArrayDescriptor__val; + hipArray* array; + hipArray array__val; + } hipArrayGetDescriptor; + struct { + hipChannelFormatDesc* desc; + hipChannelFormatDesc desc__val; + hipExtent* extent; + hipExtent extent__val; + unsigned int* flags; + unsigned int flags__val; + hipArray* array; + hipArray array__val; + } hipArrayGetInfo; struct { int* device; int device__val; @@ -3252,6 +3281,11 @@ typedef struct hip_api_data_s { cb_data.args.hipArray3DCreate.array = (hipArray**)array; \ cb_data.args.hipArray3DCreate.pAllocateArray = (const HIP_ARRAY3D_DESCRIPTOR*)pAllocateArray; \ }; +// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')] +#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipArray3DGetDescriptor.pArrayDescriptor = (HIP_ARRAY3D_DESCRIPTOR*)pArrayDescriptor; \ + cb_data.args.hipArray3DGetDescriptor.array = (hipArray*)array; \ +}; // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')] #define INIT_hipArrayCreate_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipArrayCreate.pHandle = (hipArray**)array; \ @@ -3261,6 +3295,18 @@ typedef struct hip_api_data_s { #define INIT_hipArrayDestroy_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipArrayDestroy.array = (hipArray*)array; \ }; +// 
hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')] +#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipArrayGetDescriptor.pArrayDescriptor = (HIP_ARRAY_DESCRIPTOR*)pArrayDescriptor; \ + cb_data.args.hipArrayGetDescriptor.array = (hipArray*)array; \ +}; +// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')] +#define INIT_hipArrayGetInfo_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipArrayGetInfo.desc = (hipChannelFormatDesc*)desc; \ + cb_data.args.hipArrayGetInfo.extent = (hipExtent*)extent; \ + cb_data.args.hipArrayGetInfo.flags = (unsigned int*)flags; \ + cb_data.args.hipArrayGetInfo.array = (hipArray*)array; \ +}; // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')] #define INIT_hipChooseDevice_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipChooseDevice.device = (int*)device; \ @@ -5376,10 +5422,6 @@ typedef struct hip_api_data_s { #define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data) // Macros for non-public API primitives -// hipArray3DGetDescriptor() -#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) {}; -// hipArrayGetDescriptor() -#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) {}; // hipBindTexture() #define INIT_hipBindTexture_CB_ARGS_DATA(cb_data) {}; // hipBindTexture2D() @@ -5478,6 +5520,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipArray3DCreate.array) data->args.hipArray3DCreate.array__val = *(data->args.hipArray3DCreate.array); if (data->args.hipArray3DCreate.pAllocateArray) data->args.hipArray3DCreate.pAllocateArray__val = *(data->args.hipArray3DCreate.pAllocateArray); break; +// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')] + case HIP_API_ID_hipArray3DGetDescriptor: + if (data->args.hipArray3DGetDescriptor.pArrayDescriptor) 
data->args.hipArray3DGetDescriptor.pArrayDescriptor__val = *(data->args.hipArray3DGetDescriptor.pArrayDescriptor); + if (data->args.hipArray3DGetDescriptor.array) data->args.hipArray3DGetDescriptor.array__val = *(data->args.hipArray3DGetDescriptor.array); + break; // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')] case HIP_API_ID_hipArrayCreate: if (data->args.hipArrayCreate.pHandle) data->args.hipArrayCreate.pHandle__val = *(data->args.hipArrayCreate.pHandle); @@ -5487,6 +5534,18 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { case HIP_API_ID_hipArrayDestroy: if (data->args.hipArrayDestroy.array) data->args.hipArrayDestroy.array__val = *(data->args.hipArrayDestroy.array); break; +// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')] + case HIP_API_ID_hipArrayGetDescriptor: + if (data->args.hipArrayGetDescriptor.pArrayDescriptor) data->args.hipArrayGetDescriptor.pArrayDescriptor__val = *(data->args.hipArrayGetDescriptor.pArrayDescriptor); + if (data->args.hipArrayGetDescriptor.array) data->args.hipArrayGetDescriptor.array__val = *(data->args.hipArrayGetDescriptor.array); + break; +// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')] + case HIP_API_ID_hipArrayGetInfo: + if (data->args.hipArrayGetInfo.desc) data->args.hipArrayGetInfo.desc__val = *(data->args.hipArrayGetInfo.desc); + if (data->args.hipArrayGetInfo.extent) data->args.hipArrayGetInfo.extent__val = *(data->args.hipArrayGetInfo.extent); + if (data->args.hipArrayGetInfo.flags) data->args.hipArrayGetInfo.flags__val = *(data->args.hipArrayGetInfo.flags); + if (data->args.hipArrayGetInfo.array) data->args.hipArrayGetInfo.array__val = *(data->args.hipArrayGetInfo.array); + break; // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')] case HIP_API_ID_hipChooseDevice: if 
(data->args.hipChooseDevice.device) data->args.hipChooseDevice.device__val = *(data->args.hipChooseDevice.device); @@ -6881,6 +6940,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da else { oss << ", pAllocateArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DCreate.pAllocateArray__val); } oss << ")"; break; + case HIP_API_ID_hipArray3DGetDescriptor: + oss << "hipArray3DGetDescriptor("; + if (data->args.hipArray3DGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL"; + else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.pArrayDescriptor__val); } + if (data->args.hipArray3DGetDescriptor.array == NULL) oss << ", array=NULL"; + else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.array__val); } + oss << ")"; + break; case HIP_API_ID_hipArrayCreate: oss << "hipArrayCreate("; if (data->args.hipArrayCreate.pHandle == NULL) oss << "pHandle=NULL"; @@ -6895,6 +6962,26 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayDestroy.array__val); } oss << ")"; break; + case HIP_API_ID_hipArrayGetDescriptor: + oss << "hipArrayGetDescriptor("; + if (data->args.hipArrayGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL"; + else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.pArrayDescriptor__val); } + if (data->args.hipArrayGetDescriptor.array == NULL) oss << ", array=NULL"; + else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.array__val); } + oss << ")"; + break; + case HIP_API_ID_hipArrayGetInfo: + oss << "hipArrayGetInfo("; + if (data->args.hipArrayGetInfo.desc == NULL) oss << "desc=NULL"; + else { oss << "desc="; 
roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.desc__val); } + if (data->args.hipArrayGetInfo.extent == NULL) oss << ", extent=NULL"; + else { oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.extent__val); } + if (data->args.hipArrayGetInfo.flags == NULL) oss << ", flags=NULL"; + else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.flags__val); } + if (data->args.hipArrayGetInfo.array == NULL) oss << ", array=NULL"; + else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.array__val); } + oss << ")"; + break; case HIP_API_ID_hipChooseDevice: oss << "hipChooseDevice("; if (data->args.hipChooseDevice.device == NULL) oss << "device=NULL"; diff --git a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h index 0c492b7c..a06f9b45 100644 --- a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h +++ b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h @@ -3186,6 +3186,21 @@ inline static hipError_t hipArray3DCreate(hiparray* pHandle, return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray)); } +inline static hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent, + unsigned int* flags, hipArray* array) { + return hipCUDAErrorTohipError(cudaArrayGetInfo(desc, extent, flags, array)); +} + +inline static hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + return hipCUResultTohipError(cuArrayGetDescriptor(pArrayDescriptor, (CUarray)array)); +} + +inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + return hipCUResultTohipError(cuArray3DGetDescriptor(pArrayDescriptor, (CUarray)array)); +} + inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) { return 
hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode)); } diff --git a/src/amdhip.def b/src/amdhip.def index 9a1d3e6e..70279e21 100644 --- a/src/amdhip.def +++ b/src/amdhip.def @@ -103,6 +103,9 @@ hipMemPoolImportPointer hipArrayCreate hipArray3DCreate hipArrayDestroy +hipArrayGetInfo +hipArrayGetDescriptor +hipArray3DGetDescriptor hipMallocArray hipMemAdvise hipMemAllocPitch diff --git a/src/hip_hcc.def.in b/src/hip_hcc.def.in index 187c0fdd..129fa7c4 100644 --- a/src/hip_hcc.def.in +++ b/src/hip_hcc.def.in @@ -103,6 +103,9 @@ hipMemPoolImportPointer hipArrayCreate hipArray3DCreate hipArrayDestroy +hipArrayGetInfo +hipArrayGetDescriptor +hipArray3DGetDescriptor hipMallocArray hipMemAdvise hipMemAllocPitch diff --git a/src/hip_hcc.map.in b/src/hip_hcc.map.in index a9adf372..81251cca 100644 --- a/src/hip_hcc.map.in +++ b/src/hip_hcc.map.in @@ -517,3 +517,12 @@ global: local: *; } hip_5.3; + +hip_5.6 { +global: + hipArrayGetInfo; + hipArrayGetDescriptor; + hipArray3DGetDescriptor; +local: + *; +} hip_5.5; \ No newline at end of file diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 2fdb28b5..40f5a528 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1047,6 +1047,7 @@ hipError_t ihipArrayCreate(hipArray** array, (*array)->depth = pAllocateArray->Depth; (*array)->Format = pAllocateArray->Format; (*array)->NumChannels = pAllocateArray->NumChannels; + (*array)->flags = pAllocateArray->Flags; { amd::ScopedLock lock(hip::hipArraySetLock); hip::hipArraySet.insert(*array); @@ -3541,22 +3542,105 @@ hipError_t hipArrayDestroy(hipArray* array) { HIP_RETURN(ihipArrayDestroy(array)); } -hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, - hipArray* array) { - HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); +hipError_t ihipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* desc, + hipArray* array) { + { + amd::ScopedLock lock(hip::hipArraySetLock); + if (hip::hipArraySet.find(array) == hip::hipArraySet.end()) { + return 
hipErrorInvalidHandle; + } + } - assert(false && "Unimplemented"); + desc->Width = array->width; + desc->Height = array->height; + desc->Depth = array->depth; + desc->Format = array->Format; + desc->NumChannels = array->NumChannels; + desc->Flags = array->flags; - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, + hipExtent* extent, + unsigned int* flags, + hipArray* array) { + HIP_INIT_API(hipArrayGetInfo, desc, extent, flags, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + // If all output parameters are nullptr, then no need to proceed further + if ((desc == nullptr) && (extent == nullptr) && (flags == nullptr)) { + HIP_RETURN(hipSuccess); + } + + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + if (desc != nullptr) { + *desc = hip::getChannelFormatDesc(array3DDescriptor.NumChannels, array3DDescriptor.Format); + } + + if (extent != nullptr) { + extent->width = array3DDescriptor.Width; + extent->height = array3DDescriptor.Height; + extent->depth = array3DDescriptor.Depth; + } + + if (flags != nullptr) { + *flags = array3DDescriptor.Flags; + } + } + + HIP_RETURN(status); } hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipArray* array) { HIP_INIT_API(hipArrayGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); - assert(false && "Unimplemented"); + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } - HIP_RETURN(hipSuccess); + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + pArrayDescriptor->Width = array3DDescriptor.Width; + 
pArrayDescriptor->Height = array3DDescriptor.Height; + pArrayDescriptor->Format = array3DDescriptor.Format; + pArrayDescriptor->NumChannels = array3DDescriptor.NumChannels; + } + + HIP_RETURN(status); +} + +hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(ihipArray3DGetDescriptor(pArrayDescriptor, array)); } hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, From f75ae39187c0c669ab3859a581fdacb6c0b54baf Mon Sep 17 00:00:00 2001 From: victzhan Date: Thu, 16 Feb 2023 12:41:20 -0500 Subject: [PATCH 35/56] SWDEV-376995 - Fixed misplaced checking Change-Id: Ia3bddabfa7fc76066541fb81723136ae6cffb0a4 --- src/hip_module.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 76821cfe..41b9038d 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -340,10 +340,6 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, return hipErrorOutOfMemory; } - if (kernelCommand->status() == CL_INVALID_OPERATION) { - kernelCommand->release(); - return hipErrorIllegalState; - } command = kernelCommand; return hipSuccess; @@ -398,6 +394,12 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, hip::Event* eStop = reinterpret_cast(stopEvent); eStop->BindCommand(*command, false); } + + if (command->status() == CL_INVALID_OPERATION) { + command->release(); + return hipErrorIllegalState; + } + command->release(); return hipSuccess; From 2278fb82040fda8619b4fe0cc7ccc15bac8906fc Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Fri, 17 Feb 2023 12:07:11 -0500 Subject: [PATCH 36/56] SWDEV-1 - Add missing gpu cases in getProcName Change-Id: 
I16103ab213cc70f388690df85e6a03e7a408384d --- src/amd_hsa_elf.hpp | 78 +++++++++++++++++++------------- src/hip_code_object.cpp | 45 ++++++++++++++++++ src/hiprtc/hiprtcComgrHelper.cpp | 45 ++++++++++++++++++ 3 files changed, 137 insertions(+), 31 deletions(-) diff --git a/src/amd_hsa_elf.hpp b/src/amd_hsa_elf.hpp index 45cf5c22..ca22fd1f 100644 --- a/src/amd_hsa_elf.hpp +++ b/src/amd_hsa_elf.hpp @@ -22,6 +22,9 @@ THE SOFTWARE. #pragma once +// This header file is partially copied from +// https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/ELF.h + // AMDGPU OS for HSA compatible compute kernels. enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 }; @@ -57,38 +60,51 @@ enum : unsigned { EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600, EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS, - EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, - EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, - EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, - EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, - EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, - EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, - EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X027 = 0x027, - EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, - EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, - EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, - EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, - EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, - EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, - EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, - EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, - EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, - EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, - EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, - EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, - EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, - EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, - EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, - EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, - EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, - EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, - EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, - EF_AMDGPU_MACH_AMDGCN_GFX705 = 
0x03b, - EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, - EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + // AMDGCN-based processors. + EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, + EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + + // First/last AMDGCN-based processors. 
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. diff --git a/src/hip_code_object.cpp b/src/hip_code_object.cpp index 0d5fb1b4..0709c1a0 100644 --- a/src/hip_code_object.cpp +++ b/src/hip_code_object.cpp @@ -172,6 +172,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -187,6 +192,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -207,6 +217,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + 
xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } diff --git a/src/hiprtc/hiprtcComgrHelper.cpp b/src/hiprtc/hiprtcComgrHelper.cpp index b8dca86c..c3794c61 100644 --- a/src/hiprtc/hiprtcComgrHelper.cpp +++ b/src/hiprtc/hiprtcComgrHelper.cpp @@ -157,6 +157,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -172,6 +177,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -192,6 +202,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + 
xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } From cb4273b0f4914937b670cfd441bea1880c0b696c Mon Sep 17 00:00:00 2001 From: Rakesh Roy Date: Mon, 20 Feb 2023 18:05:51 +0530 Subject: [PATCH 37/56] SWDEV-377782 - Fix segmentation fault for hipLaunchKernel - If fat binary doesn't contain code object for current gfx then inside FatBinaryInfo::ExtractFatBinary(), valid FatBinaryDeviceInfo address isn't stored in vector fatbin_dev_info_ - This raises segmentation fault during hipLaunchKernel Change-Id: I21017338d91edbd5d9cc2d37277f66558198a129 --- src/hip_fatbin.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/hip_fatbin.cpp b/src/hip_fatbin.cpp index bafc7436..c2b7ff75 100644 --- a/src/hip_fatbin.cpp +++ b/src/hip_fatbin.cpp @@ -23,13 +23,15 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd: fname_ = std::string(); } - fatbin_dev_info_.resize(g_devices.size()); + fatbin_dev_info_.resize(g_devices.size(), nullptr); } FatBinaryInfo::~FatBinaryInfo() { - for (auto& fbd: fatbin_dev_info_) { - delete fbd; + for (auto* fbd: fatbin_dev_info_) { + if (fbd != nullptr) { + delete fbd; + } } if (fdesc_ > 0) { @@ -298,6 +300,10 @@ hipError_t FatBinaryInfo::AddDevProgram(const int device_id) { DeviceIdCheck(device_id); FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id]; + if (fbd_info == nullptr) { + return hipErrorInvalidKernelFile; + } + // If fat binary was already added, skip this step and return success if (fbd_info->add_dev_prog_ == false) { amd::Context* ctx = g_devices[device_id]->asContext(); From 6e0b7f1672e47a42f0982b9756e0734707fa482f Mon Sep 17 00:00:00 2001 From: Ajay Date: Fri, 17 Feb 2023 22:46:39 +0000 Subject: [PATCH 38/56] SWDEV-384100 - HIP support for CUDA 12.0 Apps are failing to build due to 
undefined deprecated texture APIs Change-Id: I1fb64adc4bc0ba6ee6ecaa65d54b34da0327e6a3 --- include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h index a06f9b45..d7701826 100644 --- a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h +++ b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h @@ -39,6 +39,7 @@ THE SOFTWARE. #define CUDA_11030 11030 #define CUDA_11040 11040 #define CUDA_11060 11060 +#define CUDA_12000 12000 #ifdef __cplusplus extern "C" { @@ -2773,6 +2774,7 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); } +#if CUDA_VERSION < CUDA_12000 __HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr, @@ -2786,6 +2788,8 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D( const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) { return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch)); } +#endif // CUDA_VERSION < CUDA_12000 + inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) { @@ -2818,10 +2822,12 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject)); } +#if CUDA_VERSION < CUDA_12000 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset( size_t* offset, const struct textureReference* texref) { return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref)); } +#endif inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array) { @@ -3067,6 +3073,7 @@ inline static hipError_t 
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( blockSize, dynamicSMemSize, flags)); } +#if CUDA_VERSION < CUDA_12000 template inline static hipError_t hipBindTexture(size_t* offset, const struct texture& tex, const void* devPtr, size_t size = UINT_MAX) { @@ -3109,6 +3116,7 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( struct texture& tex, hipArray_const_t array) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array)); } +#endif // CUDA_VERSION < CUDA_12000 template inline static hipChannelFormatDesc hipCreateChannelDesc() { From 8a0dcba713dfb9c19c0eafc9993d86f4c969bb17 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Thu, 16 Feb 2023 17:34:12 -0500 Subject: [PATCH 39/56] SWDEV-380145 - [GFX][CQE] Mathlibs and Blender build failed in both RT and CPL Jobs Remove --hip-device-lib-path from hip-config.cmake and let clang determine device lib path Change-Id: Ice009875624692bc2895020c9577b94ddbd6840f --- hip-config.cmake.in | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/hip-config.cmake.in b/hip-config.cmake.in index 7c4fe7f9..b96803ee 100755 --- a/hip-config.cmake.in +++ b/hip-config.cmake.in @@ -206,16 +206,6 @@ if(HIP_COMPILER STREQUAL "clang") if (NOT compilePropIsSet) hip_add_interface_compile_flags(hip::device -x hip) - if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode) - # This path is to support an older build of the device library - # TODO: To be removed in the future. 
- if(WIN32) - hip_add_interface_compile_flags(hip::device -fms-extensions -fms-compatibility) - hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${HIP_PATH}/lib/bitcode\") - else() - hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${AMD_DEVICE_LIBS_PREFIX}/lib\") - endif() - endif() endif() hip_add_interface_link_flags(hip::device --hip-link) From d986f7ea8c8e141316e2ec86feb7664744e1eabe Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Tue, 21 Feb 2023 14:30:27 +0000 Subject: [PATCH 40/56] SWDEV-384362 - Added explicit cast from __fp16 to _Float16 in rcph Change-Id: Ibbb11e928a80126ed7a8c7cd8bddebc74c38986d --- include/hip/amd_detail/amd_hip_fp16.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h index fda27219..ab6496d6 100644 --- a/include/hip/amd_detail/amd_hip_fp16.h +++ b/include/hip/amd_detail/amd_hip_fp16.h @@ -1569,7 +1569,7 @@ THE SOFTWARE. __half hrcp(__half x) { return __half_raw{ - __builtin_amdgcn_rcph(static_cast<__half_raw>(x).data)}; + static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; } inline __device__ @@ -1673,8 +1673,8 @@ THE SOFTWARE. 
inline __HOST_DEVICE__ __half2 h2rcp(__half2 x) { - return _Float16_2{__builtin_amdgcn_rcph(x.x), - __builtin_amdgcn_rcph(x.y)}; + return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), + static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))}; } inline __HOST_DEVICE__ From 420a9930759831a549e2ae928b6c641c054754eb Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Wed, 8 Feb 2023 20:18:11 +0000 Subject: [PATCH 41/56] SWDEV-381402 - Derive hip::Stream from amd::HostQueue Change-Id: I6c1aca5eb350c32d974ae4ffcc725705355956d8 --- src/hip_code_object.cpp | 16 +- src/hip_context.cpp | 14 +- src/hip_device.cpp | 30 ++-- src/hip_device_runtime.cpp | 6 +- src/hip_event.cpp | 22 +-- src/hip_event.hpp | 12 +- src/hip_event_ipc.cpp | 15 +- src/hip_gl.cpp | 18 +-- src/hip_graph_helper.hpp | 12 +- src/hip_graph_internal.cpp | 243 +------------------------------ src/hip_graph_internal.hpp | 179 +++++++---------------- src/hip_hmm.cpp | 10 +- src/hip_internal.hpp | 46 +++--- src/hip_memory.cpp | 290 ++++++++++++++++++------------------- src/hip_module.cpp | 34 ++--- src/hip_platform.cpp | 8 +- src/hip_stream.cpp | 144 +++++++----------- src/hip_stream_ops.cpp | 4 +- src/hip_texture.cpp | 34 ++--- 19 files changed, 404 insertions(+), 733 deletions(-) diff --git a/src/hip_code_object.cpp b/src/hip_code_object.cpp index 0709c1a0..778783cf 100644 --- a/src/hip_code_object.cpp +++ b/src/hip_code_object.cpp @@ -32,7 +32,7 @@ THE SOFTWARE. 
#include hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); + hip::Stream& stream, bool isAsync = false); hipError_t ihipFree(void* ptr); // forward declaration of methods required for managed variables hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); @@ -635,10 +635,10 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { it->second->setManagedVarInfo(pointer, dvar->size()); // copy initial value to the managed variable to the managed memory allocated - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { status = ihipMemcpy(pointer, reinterpret_cast
(dvar->device_ptr()), dvar->size(), - hipMemcpyDeviceToDevice, *queue); + hipMemcpyDeviceToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -658,7 +658,7 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { } // copy managed memory pointer to the managed device variable status = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), &pointer, dvar->size(), - hipMemcpyHostToDevice, *queue); + hipMemcpyHostToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -895,10 +895,10 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) { DeviceVar* dvar = nullptr; IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId)); - amd::HostQueue* queue = g_devices.at(deviceId)->NullStream(); - if (queue != nullptr) { + hip::Stream* stream = g_devices.at(deviceId)->NullStream(); + if (stream != nullptr) { err = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), var->getManagedVarPtr(), - dvar->size(), hipMemcpyHostToDevice, *queue); + dvar->size(), hipMemcpyHostToDevice, *stream); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); return hipErrorInvalidResourceHandle; diff --git a/src/hip_context.cpp b/src/hip_context.cpp index 307f8452..f639d4ff 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -91,21 +91,21 @@ void setCurrentDevice(unsigned int index) { amd::Os::setPreferredNumaNode(preferredNumaNode); } -amd::HostQueue* getQueue(hipStream_t stream) { +hip::Stream* getStream(hipStream_t stream) { if (stream == nullptr) { return getNullStream(); } else { - amd::HostQueue* queue = reinterpret_cast(stream)->asHostQueue(); - if (!(reinterpret_cast(stream)->Flags() & hipStreamNonBlocking)) { + hip::Stream* hip_stream = reinterpret_cast(stream); + if (!(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; - iHipWaitActiveStreams(queue, WaitNullStreamOnly); + iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); } - return queue; + return hip_stream; } } // ================================================================================================ -amd::HostQueue* getNullStream(amd::Context& ctx) { +hip::Stream* getNullStream(amd::Context& ctx) { for (auto& it : g_devices) { if (it->asContext() == &ctx) { return it->NullStream(); @@ -131,7 +131,7 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -amd::HostQueue* getNullStream() { +hip::Stream* getNullStream() { Device* device = getCurrentDevice(); return device ? 
device->NullStream() : nullptr; } diff --git a/src/hip_device.cpp b/src/hip_device.cpp index a3d059ac..2b83616d 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -26,25 +26,31 @@ namespace hip { // ================================================================================================ -amd::HostQueue* Device::NullStream(bool skip_alloc) { - amd::HostQueue* null_queue = null_stream_.asHostQueue(skip_alloc); - if (null_queue == nullptr) { +hip::Stream* Device::NullStream(bool skip_alloc) { + if (null_stream_ == nullptr && !skip_alloc) { + null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); + } + + if (null_stream_ == nullptr) { return nullptr; } // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_queue); - return null_queue; + iHipWaitActiveStreams(null_stream_); + return null_stream_; } // ================================================================================================ -Stream* Device::GetNullStream() { - amd::HostQueue* null_queue = null_stream_.asHostQueue(); - if (null_queue == nullptr) { +hip::Stream* Device::GetNullStream() { + if (null_stream_ == nullptr) { + null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); + } + + if (null_stream_ == nullptr) { return nullptr; } // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_queue); - return &null_stream_; + iHipWaitActiveStreams(null_stream_); + return null_stream_; } // ================================================================================================ @@ -128,6 +134,10 @@ Device::~Device() { if (default_mem_pool_ != nullptr) { default_mem_pool_->release(); } + + if (null_stream_!= nullptr) { + delete null_stream_; + } } } diff --git a/src/hip_device_runtime.cpp b/src/hip_device_runtime.cpp index 0288c885..19bed5ef 100644 --- a/src/hip_device_runtime.cpp +++ b/src/hip_device_runtime.cpp @@ -512,9 +512,9 @@ hipError_t 
hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) { hipError_t hipDeviceSynchronize ( void ) { HIP_INIT_API(hipDeviceSynchronize); - amd::HostQueue* queue = hip::getNullStream(); + hip::Stream* stream = hip::getNullStream(); - if (!queue) { + if (!stream) { HIP_RETURN(hipErrorOutOfMemory); } @@ -522,7 +522,7 @@ hipError_t hipDeviceSynchronize ( void ) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } - queue->finish(); + stream->finish(); hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId()); diff --git a/src/hip_event.cpp b/src/hip_event.cpp index f556cabe..83cbb9ef 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -177,12 +177,12 @@ int64_t EventDD::time(bool getStartTs) const { } } -hipError_t Event::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) { +hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) { amd::Command::EventWaitList eventWaitList; if (event_ != nullptr) { eventWaitList.push_back(event_); } - command = new amd::Marker(*queue, kMarkerDisableFlush, eventWaitList); + command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList); if (command == NULL) { return hipErrorOutOfMemory; @@ -196,17 +196,17 @@ hipError_t Event::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* com } hipError_t Event::streamWait(hipStream_t stream, uint flags) { - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); // Access to event_ object must be lock protected amd::ScopedLock lock(lock_); - if ((event_ == nullptr) || (event_->command().queue() == queue) || ready()) { + if ((event_ == nullptr) || (event_->command().queue() == hip_stream) || ready()) { return hipSuccess; } if (!event_->notifyCmdQueue()) { return hipErrorLaunchOutOfResources; } amd::Command* command; - hipError_t status = streamWaitCommand(command, queue); + hipError_t status = streamWaitCommand(command, hip_stream); if (status != hipSuccess) { return status; } @@ 
-218,7 +218,7 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) { return hipSuccess; } -hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* queue, +hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags ) { if (command == nullptr) { int32_t releaseFlags = ((ext_flags == 0) ? flags : ext_flags) & @@ -231,7 +231,7 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* queue, releaseFlags = amd::Device::kCacheStateIgnore; } // Always submit a EventMarker. - command = new hip::EventMarker(*queue, !kMarkerDisableFlush, true, releaseFlags); + command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags); } return hipSuccess; } @@ -249,10 +249,10 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command } hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) { - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); // Keep the lock always at the beginning of this to avoid a race. 
SWDEV-277847 amd::ScopedLock lock(lock_); - hipError_t status = recordCommand(command, queue); + hipError_t status = recordCommand(command, hip_stream); if (status != hipSuccess) { return hipSuccess; } @@ -379,8 +379,8 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) { return hipErrorInvalidHandle; } hip::Event* e = reinterpret_cast(event); - amd::HostQueue* queue = hip::getQueue(stream); - if (g_devices[e->deviceId()]->devices()[0] != &queue->device()) { + hip::Stream* hip_stream = hip::getStream(stream); + if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) { return hipErrorInvalidHandle; } return e->addMarker(stream, nullptr, true); diff --git a/src/hip_event.hpp b/src/hip_event.hpp index e08ea33f..91a8193d 100644 --- a/src/hip_event.hpp +++ b/src/hip_event.hpp @@ -78,9 +78,9 @@ typedef struct ihipIpcEventShmem_s { class EventMarker : public amd::Marker { public: - EventMarker(amd::HostQueue& queue, bool disableFlush, bool markerTs = false, + EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false, int32_t scope = amd::Device::kCacheStateInvalid) - : amd::Marker(queue, disableFlush) { + : amd::Marker(stream, disableFlush) { profilingInfo_.enabled_ = true; profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = markerTs; @@ -116,11 +116,11 @@ class Event { virtual hipError_t synchronize(); hipError_t elapsedTime(Event& eStop, float& ms); - virtual hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue); + virtual hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream); virtual hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command); virtual hipError_t streamWait(hipStream_t stream, uint flags); - virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, + virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags = 0); virtual hipError_t enqueueRecordCommand(hipStream_t 
stream, amd::Command* command, bool record); hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record); @@ -175,7 +175,7 @@ class Event { protected: amd::Monitor lock_; - amd::HostQueue* stream_; + hip::Stream* stream_; amd::Event* event_; int device_id_; //! Flag to indicate hipEventRecord has not been called. This is needed for @@ -224,7 +224,7 @@ class IPCEvent : public Event { hipError_t synchronize(); hipError_t query(); - hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue); + hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream); hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command); hipError_t streamWait(hipStream_t stream, uint flags); diff --git a/src/hip_event_ipc.cpp b/src/hip_event_ipc.cpp index 7385566b..706b3d44 100644 --- a/src/hip_event_ipc.cpp +++ b/src/hip_event_ipc.cpp @@ -102,8 +102,8 @@ hipError_t IPCEvent::synchronize() { return hipSuccess; } -hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) { - command = new amd::Marker(*queue, false); +hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, hip::Stream* stream) { + command = new amd::Marker(*stream, false); if (command == NULL) { return hipErrorOutOfMemory; } @@ -125,12 +125,12 @@ hipError_t IPCEvent::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* } hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) { - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); amd::ScopedLock lock(lock_); if(query() != hipSuccess) { amd::Command* command; - hipError_t status = streamWaitCommand(command, queue); + hipError_t status = streamWaitCommand(command, hip_stream); if (status != hipSuccess) { return status; } @@ -140,18 +140,17 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) { return hipSuccess; } -hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t 
flags) { +hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) { bool unrecorded = isUnRecorded(); if (unrecorded) { - command = new amd::Marker(*queue, kMarkerDisableFlush); + command = new amd::Marker(*stream, kMarkerDisableFlush); } else { - return Event::recordCommand(command, queue); + return Event::recordCommand(command, stream); } return hipSuccess; } hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) { - amd::HostQueue* queue = hip::getQueue(stream); bool unrecorded = isUnRecorded(); if (unrecorded) { amd::Event& tEvent = command->event(); diff --git a/src/hip_gl.cpp b/src/hip_gl.cpp index 216a2cb4..ce692753 100644 --- a/src/hip_gl.cpp +++ b/src/hip_gl.cpp @@ -637,13 +637,12 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources, HIP_RETURN(hipErrorUnknown); } - amd::HostQueue* queue = hip::getQueue(stream); - if (nullptr == queue) { + hip::Stream* hip_stream = hip::getStream(stream); + if (nullptr == hip_stream) { HIP_RETURN(hipErrorUnknown); } - amd::HostQueue& hostQueue = *queue; - if (!hostQueue.context().glenv() || !hostQueue.context().glenv()->isAssociated()) { + if (!hip_stream->context().glenv() || !hip_stream->context().glenv()->isAssociated()) { LogWarning("\"amdContext\" is not created from GL context or share list"); HIP_RETURN(hipErrorUnknown); } @@ -658,7 +657,7 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources, //! 
Now create command and enqueue amd::AcquireExtObjectsCommand* command = new amd::AcquireExtObjectsCommand( - hostQueue, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS); + *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS); if (command == nullptr) { HIP_RETURN(hipErrorUnknown); } @@ -712,13 +711,12 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources } // Wait for the current host queue - hip::getQueue(stream)->finish(); + hip::getStream(stream)->finish(); - amd::HostQueue* queue = hip::getQueue(stream); - if (nullptr == queue) { + hip::Stream* hip_stream = hip::getStream(stream); + if (nullptr == hip_stream) { HIP_RETURN(hipErrorUnknown); } - amd::HostQueue& hostQueue = *queue; std::vector memObjects; hipError_t err = hipSetInteropObjects(count, reinterpret_cast(resources), memObjects); @@ -730,7 +728,7 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources // Now create command and enqueue amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand( - hostQueue, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS); + *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS); if (command == nullptr) { HIP_RETURN(hipErrorUnknown); } diff --git a/src/hip_graph_helper.hpp b/src/hip_graph_helper.hpp index 69780338..20d01165 100644 --- a/src/hip_graph_helper.hpp +++ b/src/hip_graph_helper.hpp @@ -5,9 +5,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p); hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync = false); + hipMemcpyKind kind, hip::Stream& stream, bool isAsync = false); -void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue); +void ihipHtoHMemcpy(void* dst, const void* 
src, size_t sizeBytes, hip::Stream& stream); bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind); @@ -26,19 +26,19 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, - amd::HostQueue* queue, void** kernelParams, void** extra, + hip::Stream* stream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, uint32_t params, uint32_t gridId, uint32_t numGrids, uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice); hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p, - amd::HostQueue* queue); + hip::Stream* stream); hipError_t ihipMemsetCommand(std::vector& commands, void* dst, int64_t value, - size_t valueSize, size_t sizeBytes, amd::HostQueue* queue); + size_t valueSize, size_t sizeBytes, hip::Stream* stream); hipError_t ihipMemset3DCommand(std::vector& commands, hipPitchedPtr pitchedDevPtr, - int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1); + int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1); hipError_t ihipMemcpySymbol_validate(const void* symbol, size_t sizeBytes, size_t offset, size_t& sym_size, hipDeviceptr_t& device_ptr); diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp index 68bcab0a..5733cc5c 100644 --- a/src/hip_graph_internal.cpp +++ b/src/hip_graph_internal.cpp @@ -98,56 +98,6 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size return hipSuccess; } -hipError_t hipGraphMemcpyNode1D::SetCommandParams(void* dst, const void* src, size_t count, - hipMemcpyKind kind) { - hipError_t status = ihipMemcpy_validate(dst, src, count, kind); - if (status != hipSuccess) { - return status; - } - size_t sOffsetOrig = 0; - amd::Memory* origSrcMemory = getMemoryObject(src, sOffsetOrig); - size_t 
dOffsetOrig = 0; - amd::Memory* origDstMemory = getMemoryObject(dst, dOffsetOrig); - - size_t sOffset = 0; - amd::Memory* srcMemory = getMemoryObject(src, sOffset); - size_t dOffset = 0; - amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - - if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { - return hipErrorInvalidValue; - } - amd::WriteMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*dstMemory->asBuffer(), dOffset, count, src); - } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) { - return hipErrorInvalidValue; - } - amd::ReadMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory->asBuffer(), sOffset, count, dst); - } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { - if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { - return hipErrorInvalidValue; - } - if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) { - return hipErrorInvalidValue; - } - amd::CopyMemoryP2PCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count); - // Make sure runtime has valid memory for the command execution. 
P2P access - // requires page table mapping on the current device to another GPU memory - if (!static_cast(command)->validateMemory()) { - delete command; - return hipErrorInvalidValue; - } - } else { - amd::CopyMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count); - } - return hipSuccess; -} - hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) { hipError_t status = ihipMemcpy3D_validate(pNodeParams); if (status != hipSuccess) { @@ -297,185 +247,6 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam return hipSuccess; } -hipError_t hipGraphMemcpyNode::SetCommandParams(const hipMemcpy3DParms* pNodeParams) { - hipError_t status = ihipMemcpy3D_validate(pNodeParams); - if (status != hipSuccess) { - return status; - } - const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pNodeParams); - // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the - // (unified virtual address space) base address of the source data and the bytes per row to apply. - // {src/dst}Array is ignored. - hipMemoryType srcMemoryType = pCopy.srcMemoryType; - if (srcMemoryType == hipMemoryTypeUnified) { - srcMemoryType = - amd::MemObjMap::FindMemObj(pCopy.srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeHost) { - // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system - // memory. - const_cast(&pCopy)->srcHost = pCopy.srcDevice; - } - } - hipMemoryType dstMemoryType = pCopy.dstMemoryType; - if (dstMemoryType == hipMemoryTypeUnified) { - dstMemoryType = - amd::MemObjMap::FindMemObj(pCopy.dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeHost) { - const_cast(&pCopy)->dstHost = pCopy.dstDevice; - } - } - - // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned. 
- // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. - if (srcMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.srcHost); - srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (srcMemoryType == hipMemoryTypeDevice) { - const_cast(&pCopy)->srcDevice = const_cast(pCopy.srcHost); - } - } - if (dstMemoryType == hipMemoryTypeHost) { - amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.dstHost); - dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; - if (dstMemoryType == hipMemoryTypeDevice) { - const_cast(&pCopy)->dstDevice = const_cast(pCopy.dstDevice); - } - } - - amd::Coord3D srcOrigin = {pCopy.srcXInBytes, pCopy.srcY, pCopy.srcZ}; - amd::Coord3D dstOrigin = {pCopy.dstXInBytes, pCopy.dstY, pCopy.dstZ}; - amd::Coord3D copyRegion = {pCopy.WidthInBytes, pCopy.Height, pCopy.Depth}; - - if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeDevice)) { - // Host to Device. - - amd::Memory* dstMemory; - amd::BufferRect srcRect; - amd::BufferRect dstRect; - - status = - ihipMemcpyHtoDValidate(pCopy.srcHost, pCopy.dstDevice, srcOrigin, dstOrigin, copyRegion, - pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch, - pCopy.dstPitch * pCopy.dstHeight, dstMemory, srcRect, dstRect); - if (status != hipSuccess) { - return status; - } - amd::WriteMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*dstMemory, {dstRect.start_, 0, 0}, copyRegion, pCopy.srcHost, dstRect, - srcRect); - } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) { - // Device to Host. 
- amd::Memory* srcMemory; - amd::BufferRect srcRect; - amd::BufferRect dstRect; - status = - ihipMemcpyDtoHValidate(pCopy.srcDevice, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion, - pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch, - pCopy.dstPitch * pCopy.dstHeight, srcMemory, srcRect, dstRect); - if (status != hipSuccess) { - return status; - } - amd::ReadMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory, {srcRect.start_, 0, 0}, copyRegion, pCopy.dstHost, srcRect, - dstRect); - command->setSource(*srcMemory); - command->setOrigin({srcRect.start_, 0, 0}); - command->setSize(copyRegion); - command->setDestination(pCopy.dstHost); - command->setBufRect(srcRect); - command->setHostRect(dstRect); - } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) { - // Device to Device. - amd::Memory* srcMemory; - amd::Memory* dstMemory; - amd::BufferRect srcRect; - amd::BufferRect dstRect; - - status = ihipMemcpyDtoDValidate(pCopy.srcDevice, pCopy.dstDevice, srcOrigin, dstOrigin, - copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, - pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcMemory, - dstMemory, srcRect, dstRect); - if (status != hipSuccess) { - return status; - } - amd::CopyMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory, *dstMemory, {srcRect.start_, 0, 0}, {dstRect.start_, 0, 0}, - copyRegion, srcRect, dstRect); - } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) { - amd::Image* dstImage; - amd::BufferRect srcRect; - - status = - ihipMemcpyHtoAValidate(pCopy.srcHost, pCopy.dstArray, srcOrigin, dstOrigin, copyRegion, - pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, dstImage, srcRect); - if (status != hipSuccess) { - return status; - } - amd::WriteMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*dstImage, dstOrigin, copyRegion, - 
static_cast(pCopy.srcHost) + srcRect.start_, pCopy.srcPitch, - pCopy.srcPitch * pCopy.srcHeight); - } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) { - // Image to Host. - amd::Image* srcImage; - amd::BufferRect dstRect; - - status = - ihipMemcpyAtoHValidate(pCopy.srcArray, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion, - pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcImage, dstRect); - if (status != hipSuccess) { - return status; - } - amd::ReadMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcImage, srcOrigin, copyRegion, - static_cast(pCopy.dstHost) + dstRect.start_, pCopy.dstPitch, - pCopy.dstPitch * pCopy.dstHeight); - } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) { - // Device to Image. - amd::Image* dstImage; - amd::Memory* srcMemory; - amd::BufferRect dstRect; - amd::BufferRect srcRect; - status = ihipMemcpyDtoAValidate(pCopy.srcDevice, pCopy.dstArray, srcOrigin, dstOrigin, - copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, - dstImage, srcMemory, dstRect, srcRect); - if (status != hipSuccess) { - return status; - } - amd::CopyMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); - } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) { - // Image to Device. 
- amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Memory* dstMemory; - amd::Image* srcImage; - status = ihipMemcpyAtoDValidate(pCopy.srcArray, pCopy.dstDevice, srcOrigin, dstOrigin, - copyRegion, pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, - dstMemory, srcImage, srcRect, dstRect); - if (status != hipSuccess) { - return status; - } - amd::CopyMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); - } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) { - amd::Image* srcImage; - amd::Image* dstImage; - - status = ihipMemcpyAtoAValidate(pCopy.srcArray, pCopy.dstArray, srcOrigin, dstOrigin, - copyRegion, srcImage, dstImage); - if (status != hipSuccess) { - return status; - } - amd::CopyMemoryCommand* command = reinterpret_cast(commands_[0]); - command->setParams(*srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion); - } else { - return hipErrorInvalidValue; - } - return hipSuccess; -} - - bool ihipGraph::isGraphValid(ihipGraph* pGraph) { amd::ScopedLock lock(graphSetLock_); if (graphSet_.find(pGraph) == graphSet_.end()) { @@ -685,7 +456,9 @@ hipError_t hipGraphExec::CreateStreams(uint32_t num_streams) { auto stream = new hip::Stream(hip::getCurrentDevice(), hip::Stream::Priority::Normal, hipStreamNonBlocking); if (stream == nullptr || !stream->Create()) { - delete stream; + if (stream != nullptr) { + stream->release(); + } ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n"); return hipErrorOutOfMemory; } @@ -708,7 +481,7 @@ hipError_t hipGraphExec::Init() { hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& levelOrder, std::vector& rootCommands, - amd::Command*& endCommand, amd::HostQueue* queue) { + amd::Command*& endCommand, hip::Stream* stream) { hipError_t status; for (auto& node : levelOrder) { // TODO: clone commands from 
next launch @@ -758,7 +531,7 @@ hipError_t FillCommands(std::vector>& parallelLists, } } if (!graphLastCmdWaitList.empty()) { - endCommand = new amd::Marker(*queue, false, graphLastCmdWaitList); + endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList); if (endCommand == nullptr) { return hipErrorOutOfMemory; } @@ -787,8 +560,8 @@ void UpdateStream(std::vector>& parallelLists, hip::Stream* st hipError_t hipGraphExec::Run(hipStream_t stream) { hipError_t status; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + + if (hip::getStream(stream) == nullptr) { return hipErrorInvalidResourceHandle; } if (flags_ == hipGraphInstantiateFlagAutoFreeOnLaunch) { @@ -802,7 +575,7 @@ hipError_t hipGraphExec::Run(hipStream_t stream) { std::vector rootCommands; amd::Command* endCommand = nullptr; status = - FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, queue); + FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, hip_stream); if (status != hipSuccess) { return status; } diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 952bb8ee..4f0b5dd3 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -38,7 +38,7 @@ typedef hipGraphNode* Node; hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& levelOrder, std::vector& rootCommands, - amd::Command*& endCommand, amd::HostQueue* queue); + amd::Command*& endCommand, hip::Stream* stream); void UpdateStream(std::vector>& parallelLists, hip::Stream* stream, hipGraphExec* ptr); @@ -155,7 +155,6 @@ struct hipGraphNodeDOTAttribute { struct hipGraphNode : public hipGraphNodeDOTAttribute { protected: hip::Stream* stream_ = nullptr; - amd::HostQueue* queue_; uint32_t level_; unsigned int id_; hipGraphNodeType type_; @@ -222,16 +221,15 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { return true; } - amd::HostQueue* GetQueue() { return queue_; } 
+ hip::Stream* GetQueue() { return stream_; } virtual void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) { stream_ = stream; - queue_ = stream->asHostQueue(); } /// Create amd::command for the graph node - virtual hipError_t CreateCommand(amd::HostQueue* queue) { + virtual hipError_t CreateCommand(hip::Stream* stream) { commands_.clear(); - queue_ = queue; + stream_ = stream; return hipSuccess; } /// Return node unique ID @@ -350,8 +348,8 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute { (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy || type_ == hipGraphNodeTypeMemset)) { amd::Command::EventWaitList waitList; - amd::HostQueue* queue = hip::getQueue(stream); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList); command->enqueue(); command->release(); return; @@ -575,7 +573,9 @@ struct hipGraphExec { // new commands are launched for every launch they are destroyed as and when command is // terminated after it complete execution for (auto stream : parallel_streams_) { - delete stream; + if (stream != nullptr) { + stream->release(); + } } for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second; amd::ScopedLock lock(graphExecSetLock_); @@ -645,7 +645,6 @@ struct hipChildGraphNode : public hipGraphNode { void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) { stream_ = stream; - queue_ = stream->asHostQueue(); UpdateStream(parallelLists_, stream, ptr); } @@ -654,8 +653,8 @@ struct hipChildGraphNode : public hipGraphNode { std::vector& GetCommands() { return parallelLists_[0].back()->GetCommands(); } // Create child graph node commands and set waitlists - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + 
hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -663,7 +662,7 @@ struct hipChildGraphNode : public hipGraphNode { std::vector rootCommands; amd::Command* endCommand = nullptr; status = FillCommands(parallelLists_, nodeWaitLists_, childGraphlevelOrder_, rootCommands, - endCommand, queue); + endCommand, stream); for (auto& cmd : rootCommands) { commands_.push_back(cmd); } @@ -933,14 +932,14 @@ class hipGraphKernelNode : public hipGraphNode { return new hipGraphKernelNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { + hipError_t CreateCommand(hip::Stream* stream) { hipFunction_t func = nullptr; hipError_t status = validateKernelParams(pKernelParams_, &func, - queue ? hip::getDeviceID(queue->context()) : -1); + stream ? hip::getDeviceID(stream->context()) : -1); if (hipSuccess != status) { return status; } - status = hipGraphNode::CreateCommand(queue); + status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -951,7 +950,7 @@ class hipGraphKernelNode : public hipGraphNode { pKernelParams_->gridDim.y * pKernelParams_->blockDim.y, pKernelParams_->gridDim.z * pKernelParams_->blockDim.z, pKernelParams_->blockDim.x, pKernelParams_->blockDim.y, pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes, - queue, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0, + stream, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0, 0, 0); commands_.emplace_back(command); return status; @@ -1044,22 +1043,6 @@ class hipGraphKernelNode : public hipGraphNode { } return hipSuccess; } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const hipKernelNodeParams* params) { - // updates kernel params - hipError_t status = validateKernelParams(params); - if (hipSuccess != status) { - return status; - } - size_t globalWorkOffset[3] = {0}; - 
size_t globalWorkSize[3] = {params->gridDim.x, params->gridDim.y, params->gridDim.z}; - size_t localWorkSize[3] = {params->blockDim.x, params->blockDim.y, params->blockDim.z}; - reinterpret_cast(commands_[0]) - ->setSizes(globalWorkOffset, globalWorkSize, localWorkSize); - reinterpret_cast(commands_[0]) - ->setSharedMemBytes(params->sharedMemBytes); - return hipSuccess; - } hipError_t SetParams(hipGraphNode* node) { const hipGraphKernelNode* kernelNode = static_cast(node); @@ -1110,17 +1093,17 @@ class hipGraphMemcpyNode : public hipGraphNode { return new hipGraphMemcpyNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { + hipError_t CreateCommand(hip::Stream* stream) { if (IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) { return hipSuccess; } - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } commands_.reserve(1); amd::Command* command; - status = ihipMemcpy3DCommand(command, pCopyParams_, queue); + status = ihipMemcpy3DCommand(command, pCopyParams_, stream); commands_.emplace_back(command); return status; } @@ -1129,7 +1112,7 @@ class hipGraphMemcpyNode : public hipGraphNode { if (isEnabled_ && IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) { ihipHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->extent.width * pCopyParams_->extent.height * - pCopyParams_->extent.depth, *hip::getQueue(stream)); + pCopyParams_->extent.depth, *hip::getStream(stream)); return; } hipGraphNode::EnqueueCommands(stream); @@ -1150,8 +1133,6 @@ class hipGraphMemcpyNode : public hipGraphNode { const hipGraphMemcpyNode* memcpyNode = static_cast(node); return SetParams(memcpyNode->pCopyParams_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const hipMemcpy3DParms* pNodeParams); hipError_t 
ValidateParams(const hipMemcpy3DParms* pNodeParams); std::string GetLabel(hipGraphDebugDotFlags flag) { const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pCopyParams_); @@ -1256,17 +1237,17 @@ class hipGraphMemcpyNode1D : public hipGraphNode { return new hipGraphMemcpyNode1D(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { + virtual hipError_t CreateCommand(hip::Stream* stream) { if (IsHtoHMemcpy(dst_, src_, kind_)) { return hipSuccess; } - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } commands_.reserve(1); amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *queue); + status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream); commands_.emplace_back(command); return status; } @@ -1281,14 +1262,14 @@ class hipGraphMemcpyNode1D : public hipGraphNode { if (isEnabled_) { //HtoH if (isH2H) { - ihipHtoHMemcpy(dst_, src_, count_, *hip::getQueue(stream)); + ihipHtoHMemcpy(dst_, src_, count_, *hip::getStream(stream)); return; } amd::Command* command = commands_[0]; amd::HostQueue* cmdQueue = command->queue(); - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); - if (cmdQueue == queue) { + if (cmdQueue == hip_stream) { command->enqueue(); command->release(); return; @@ -1296,7 +1277,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode { amd::Command::EventWaitList waitList; amd::Command* depdentMarker = nullptr; - amd::Command* cmd = queue->getLastQueuedCommand(true); + amd::Command* cmd = hip_stream->getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); amd::Command* depdentMarker = new amd::Marker(*cmdQueue, true, waitList); @@ -1313,7 +1294,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode { if (cmd != nullptr) { waitList.clear(); waitList.push_back(cmd); - amd::Command* depdentMarker = new 
amd::Marker(*queue, true, waitList); + amd::Command* depdentMarker = new amd::Marker(*hip_stream, true, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); // Make sure future commands of queue synced with command depdentMarker->release(); @@ -1322,8 +1303,8 @@ class hipGraphMemcpyNode1D : public hipGraphNode { } } else { amd::Command::EventWaitList waitList; - amd::HostQueue* queue = hip::getQueue(stream); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList); command->enqueue(); command->release(); } @@ -1346,8 +1327,6 @@ class hipGraphMemcpyNode1D : public hipGraphNode { return SetParams(memcpy1DNode->dst_, memcpy1DNode->src_, memcpy1DNode->count_, memcpy1DNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(void* dst, const void* src, size_t count, hipMemcpyKind kind); static hipError_t ValidateParams(void* dst, const void* src, size_t count, hipMemcpyKind kind); std::string GetLabel(hipGraphDebugDotFlags flag) { size_t sOffsetOrig = 0; @@ -1414,8 +1393,8 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1428,7 +1407,7 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *queue); + status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1474,18 +1453,6 @@ class hipGraphMemcpyNodeFromSymbol : public 
hipGraphMemcpyNode1D { return SetParams(memcpyNode->dst_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(void* dst, const void* symbol, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(dst, device_ptr, count, kind); - } }; class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { const void* symbol_; @@ -1504,8 +1471,8 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return new hipGraphMemcpyNodeToSymbol(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1518,7 +1485,7 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *queue); + status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1562,18 +1529,6 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return SetParams(memcpyNode->src_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const void* symbol, const void* src, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, 
count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(device_ptr, src, count, kind); - } }; class hipGraphMemsetNode : public hipGraphNode { @@ -1633,21 +1588,21 @@ class hipGraphMemsetNode : public hipGraphNode { } } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } if (pMemsetParams_->height == 1) { size_t sizeBytes = pMemsetParams_->width * pMemsetParams_->elementSize; hipError_t status = ihipMemsetCommand(commands_, pMemsetParams_->dst, pMemsetParams_->value, - pMemsetParams_->elementSize, sizeBytes, queue); + pMemsetParams_->elementSize, sizeBytes, stream); } else { hipError_t status = ihipMemset3DCommand( commands_, {pMemsetParams_->dst, pMemsetParams_->pitch, pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height}, - pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, queue, pMemsetParams_->elementSize); + pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, stream, pMemsetParams_->elementSize); } return status; } @@ -1706,15 +1661,15 @@ class hipGraphEventRecordNode : public hipGraphNode { return new hipGraphEventRecordNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command = nullptr; - status = e->recordCommand(command, queue); + status = e->recordCommand(command, stream); commands_.emplace_back(command); return status; } @@ 
-1744,16 +1699,6 @@ class hipGraphEventRecordNode : public hipGraphNode { static_cast(node); return SetParams(eventRecordNode->event_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphEventWaitNode : public hipGraphNode { @@ -1769,15 +1714,15 @@ class hipGraphEventWaitNode : public hipGraphNode { return new hipGraphEventWaitNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command; - status = e->streamWaitCommand(command, queue); + status = e->streamWaitCommand(command, stream); commands_.emplace_back(command); return status; } @@ -1806,16 +1751,6 @@ class hipGraphEventWaitNode : public hipGraphNode { const hipGraphEventWaitNode* eventWaitNode = static_cast(node); return SetParams(eventWaitNode->event_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphHostNode : public hipGraphNode { @@ -1836,14 +1771,14 @@ class hipGraphHostNode : public hipGraphNode { return new hipGraphHostNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = 
hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1885,8 +1820,6 @@ class hipGraphHostNode : public hipGraphNode { const hipGraphHostNode* hostNode = static_cast(node); return SetParams(hostNode->pNodeParams_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const hipHostNodeParams* params); }; class hipGraphEmptyNode : public hipGraphNode { @@ -1898,14 +1831,14 @@ class hipGraphEmptyNode : public hipGraphNode { return new hipGraphEmptyNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1925,8 +1858,8 @@ class hipGraphMemAllocNode : public hipGraphNode { return new hipGraphMemAllocNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = hipGraphNode::CreateCommand(queue); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); auto ptr = Execute(stream_); return error; } @@ -1966,8 +1899,8 @@ class hipGraphMemFreeNode : public hipGraphNode { return new hipGraphMemFreeNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = 
hipGraphNode::CreateCommand(queue); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); Execute(stream_); return error; } diff --git a/src/hip_hmm.cpp b/src/hip_hmm.cpp index be4c6cb4..ec201663 100644 --- a/src/hip_hmm.cpp +++ b/src/hip_hmm.cpp @@ -94,7 +94,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, HIP_RETURN(hipErrorInvalidDevice); } - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; amd::Device* dev = nullptr; bool cpu_access = false; @@ -106,19 +106,19 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, // Pick the specified stream or Null one from the provided device if (device == hipCpuDeviceId) { cpu_access = true; - queue = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getStream(stream); } else { dev = g_devices[device]->devices()[0]; - queue = (stream == nullptr) ? g_devices[device]->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? 
g_devices[device]->NullStream() : hip::getStream(stream); } - if (queue == nullptr) { + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } amd::Command::EventWaitList waitList; amd::SvmPrefetchAsyncCommand* command = - new amd::SvmPrefetchAsyncCommand(*queue, waitList, dev_ptr, count, dev, cpu_access); + new amd::SvmPrefetchAsyncCommand(*hip_stream, waitList, dev_ptr, count, dev, cpu_access); if (command == nullptr) { return hipErrorOutOfMemory; } diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index a416c1e7..84782cb6 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -225,12 +225,11 @@ class stream_per_thread { namespace hip { class Device; class MemoryPool; - class Stream { + class Stream : public amd::HostQueue { public: enum Priority : int { High = -1, Normal = 0, Low = 1 }; private: - amd::HostQueue* queue_; mutable amd::Monitor lock_; Device* device_; Priority priority_; @@ -260,18 +259,20 @@ namespace hip { /// Capture events std::unordered_set captureEvents_; unsigned long long captureID_; + + static inline CommandQueue::Priority convertToQueuePriority(Priority p){ + return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ? + amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal; + } + public: Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false, const std::vector& cuMask = {}, hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone); - ~Stream(); + /// Creates the hip stream object, including AMD host queue bool Create(); - - /// Get device AMD host queue object. 
The method can allocate the queue - amd::HostQueue* asHostQueue(bool skip_alloc = false); - - void Finish() const; + virtual bool terminate() override; /// Get device ID associated with the current stream; int DeviceId() const; /// Get HIP device associated with the stream @@ -378,6 +379,7 @@ namespace hip { parallelCaptureStreams_.erase(it); } } + static bool existsActiveStreamForDevice(hip::Device* device); }; /// HIP Device class @@ -389,7 +391,7 @@ namespace hip { /// Store it here so we don't have to loop through the device list every time int deviceId_; /// ROCclr host queue for default streams - Stream null_stream_; + Stream* null_stream_ = nullptr; /// Store device flags unsigned int flags_; /// Maintain list of user enabled peers @@ -398,7 +400,6 @@ namespace hip { /// True if this device is active bool isActive_; - std::vector queues_; MemoryPool* default_mem_pool_; MemoryPool* current_mem_pool_; @@ -408,7 +409,6 @@ namespace hip { public: Device(amd::Context* ctx, int devId): context_(ctx), deviceId_(devId), - null_stream_(this, Stream::Priority::Normal, 0, true), flags_(hipDeviceScheduleSpin), isActive_(false), default_mem_pool_(nullptr), @@ -445,22 +445,16 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - amd::HostQueue* NullStream(bool skip_alloc = false); - Stream* GetNullStream(); + hip::Stream* NullStream(bool skip_alloc = false); + Stream* GetNullStream(); - void SaveQueue(amd::HostQueue* queue) { - amd::ScopedLock lock(lock_); - queues_.push_back(queue); - } bool GetActiveStatus() { amd::ScopedLock lock(lock_); if (isActive_) return true; - for (int i = 0; i < queues_.size(); i++) { - if (queues_[i]->GetQueueStatus()) { - isActive_ = true; - return true; - } + if (Stream::existsActiveStreamForDevice(this)) { + isActive_ = true; + return true; } return false; } @@ -524,11 +518,11 @@ namespace hip { /// Get ROCclr queue associated with hipStream /// Note: This follows the CUDA spec to sync with default streams 
/// and Blocking streams - extern amd::HostQueue* getQueue(hipStream_t stream); + extern hip::Stream* getStream(hipStream_t stream); /// Get default stream associated with the ROCclr context - extern amd::HostQueue* getNullStream(amd::Context&); + extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern amd::HostQueue* getNullStream(); + extern hip::Stream* getNullStream(); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid @@ -542,7 +536,7 @@ extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* /// Wait all active streams on the blocking queue. The method enqueues a wait command and /// doesn't stall the current thread -extern void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream = false); +extern void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); extern std::vector g_devices; extern hipError_t ihipDeviceGetCount(int* count); diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 40f5a528..d642354a 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -78,9 +78,9 @@ hipError_t ihipFree(void *ptr) { auto dev = g_devices[device_id]; // Skip stream allocation, since if it wasn't allocated until free, then the device wasn't used constexpr bool SkipStreamAlloc = true; - amd::HostQueue* queue = dev->NullStream(SkipStreamAlloc); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(SkipStreamAlloc); + if (stream != nullptr) { + stream->finish(); } hip::Stream::syncNonBlockingStreams(device_id); // Find out if memory belongs to any memory pool @@ -195,15 +195,15 @@ hipError_t hipSignalExternalSemaphoresAsync( if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if 
(hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE); if (command == nullptr) { return hipErrorOutOfMemory; @@ -227,15 +227,15 @@ hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemAr if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE); if (command == nullptr) { return hipErrorOutOfMemory; @@ -343,35 +343,35 @@ hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, } hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync) { + hipMemcpyKind kind, hip::Stream& stream, bool isAsync) { amd::Command::EventWaitList waitList; size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - amd::Device* queueDevice = &queue.device(); + amd::Device* queueDevice = &stream.device(); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); if ((srcMemory == nullptr) 
&& (dstMemory != nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != dstMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList, + command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList, *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != srcMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList, + command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList, *srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { // Check if the queue device doesn't match the device on any memory object. 
@@ -380,7 +380,7 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && ((srcMemory->getContext().devices().size() == 1) && (dstMemory->getContext().devices().size() == 1))) { - command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); if (command == nullptr) { return hipErrorOutOfMemory; @@ -392,12 +392,12 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, return hipErrorInvalidValue; } } else { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && (queueDevice != srcMemory->getContext().devices()[0])) { copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::NONE; - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } @@ -405,22 +405,22 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, // Scenarios such as DtoH where dst is pinned memory if ((queueDevice != srcMemory->getContext().devices()[0]) && (dstMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } // Scenarios such as HtoD where src is pinned memory } else if ((queueDevice != dstMemory->getContext().devices()[0]) && 
(srcMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } } - command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes, copyMetadata); } @@ -445,13 +445,13 @@ bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind) { } return false; } -void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue) { - queue.finish(); +void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream) { + stream.finish(); memcpy(dst, src, sizeBytes); } // ================================================================================================ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false) { + hip::Stream& stream, bool isAsync = false) { hipError_t status; if (sizeBytes == 0) { // Skip if nothing needs writing. 
@@ -470,7 +470,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); if (srcMemory == nullptr && dstMemory == nullptr) { - ihipHtoHMemcpy(dst, src, sizeBytes, queue); + ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { isAsync = false; @@ -483,7 +483,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin isP2P = true; } amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, queue, isAsync); + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isAsync); if (status != hipSuccess) { return status; } @@ -491,22 +491,22 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (!isAsync) { command->awaitCompletion(); } else if (isP2P) { - amd::HostQueue* pQueue = hip::getNullStream(dstMemory->getContext()); + hip::Stream* pStream = hip::getNullStream(dstMemory->getContext()); amd::Command::EventWaitList waitList; waitList.push_back(command); - amd::Command* depdentMarker = new amd::Marker(*pQueue, false, waitList); + amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); depdentMarker->release(); } } else { amd::HostQueue* newQueue = command->queue(); - if (newQueue != &queue) { + if (newQueue != &stream) { amd::Command::EventWaitList waitList; amd::Command* cmd = newQueue->getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); - amd::Command* depdentMarker = new amd::Marker(queue, true, waitList); + amd::Command* depdentMarker = new amd::Marker(stream, true, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); depdentMarker->release(); @@ -611,18 +611,18 @@ hipError_t hipFree(void* ptr) { hipError_t hipMemcpy_common(void* dst, const void* src, size_t 
sizeBytes, hipMemcpyKind kind, hipStream_t stream = nullptr) { CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; if (stream != nullptr) { - queue = hip::getQueue(stream); + hip_stream = hip::getStream(stream); } else { - queue = hip::getNullStream(); + hip_stream = hip::getNullStream(); } - if (queue == nullptr) { + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream); } hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -643,12 +643,12 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, HIP_RETURN(hipErrorContextIsDestroyed); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, false)); } hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) { @@ -697,9 +697,9 @@ hipError_t ihipArrayDestroy(hipArray* array) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } @@ -1205,9 +1205,9 @@ hipError_t ihipHostUnregister(void* hostPtr) { // Wait on the device, associated with the current memory object during allocation auto device_id = mem->getUserData().deviceId; - amd::HostQueue* queue = g_devices[device_id]->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = g_devices[device_id]->NullStream(true); + if (stream != nullptr) { + stream->finish(); } amd::MemObjMap::RemoveMemObj(hostPtr); @@ -1392,11 +1392,11 @@ hipError_t 
hipMemcpyHtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *stream)); } hipError_t hipMemcpyDtoH(void* dstHost, @@ -1404,11 +1404,11 @@ hipError_t hipMemcpyDtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *stream)); } hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, @@ -1416,22 +1416,22 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *stream)); } hipError_t hipMemcpyAsync_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyAsync, stream, dst, src, sizeBytes, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if 
(queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue, true); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, true); } hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, @@ -1452,12 +1452,12 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, void* srcHost, size_t By HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); hipMemcpyKind kind = hipMemcpyHostToDevice; STREAM_CAPTURE(hipMemcpyHtoDAsync, stream, dstDevice, srcHost, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1465,12 +1465,12 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToDevice; STREAM_CAPTURE(hipMemcpyDtoDAsync, stream, dstDevice, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1478,12 +1478,12 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t By HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, 
srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToHost; STREAM_CAPTURE(hipMemcpyDtoHAsync, stream, dstHost, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coord3D& srcOrigin, @@ -1532,7 +1532,7 @@ hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coor hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::BufferRect srcRect; amd::BufferRect dstRect; amd::Memory* dstMemory; @@ -1544,7 +1544,7 @@ hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, voi return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE_TO_BUFFER, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); @@ -1606,7 +1606,7 @@ hipError_t ihipMemcpyDtoAValidate(void* srcDevice, hipArray* dstArray, amd::Coor hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::Image* dstImage; amd::Memory* srcMemory; amd::BufferRect dstRect; @@ -1617,7 +1617,7 @@ hipError_t ihipMemcpyDtoACommand(amd::Command*& command, 
void* srcDevice, hipArr if (status != hipSuccess) { return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_BUFFER_TO_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, *srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); @@ -1679,7 +1679,7 @@ hipError_t ihipMemcpyDtoDValidate(void* srcDevice, void* dstDevice, amd::Coord3D hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream) { amd::Memory* srcMemory; amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1694,7 +1694,7 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand( - *queue, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect); if (copyCommand == nullptr) { @@ -1744,7 +1744,7 @@ hipError_t ihipMemcpyDtoHValidate(void* srcDevice, void* dstHost, amd::Coord3D& hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* srcMemory; amd::BufferRect srcRect; @@ -1758,7 +1758,7 @@ hipError_t 
ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::ReadMemoryCommand* readCommand = - new amd::ReadMemoryCommand(*queue, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, + new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); @@ -1809,7 +1809,7 @@ hipError_t ihipMemcpyHtoDValidate(const void* srcHost, void* dstDevice, amd::Coo hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1824,7 +1824,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, + *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); if (writeCommand == nullptr) { @@ -1842,7 +1842,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { if ((srcHost == 
nullptr) || (dstHost == nullptr)) { return hipErrorInvalidValue; } @@ -1859,8 +1859,8 @@ hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOr return hipErrorInvalidValue; } - if (queue) { - queue->finish(); + if (stream) { + stream->finish(); } for (size_t slice = 0; slice < copyRegion[2]; slice++) { @@ -1909,7 +1909,7 @@ hipError_t ihipMemcpyAtoAValidate(hipArray* srcArray, hipArray* dstArray, amd::C hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, - amd::Coord3D copyRegion, amd::HostQueue* queue) { + amd::Coord3D copyRegion, hip::Stream* stream) { amd::Image* srcImage; amd::Image* dstImage; @@ -1919,7 +1919,7 @@ hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hip return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE, amd::Command::EventWaitList{}, *srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion); @@ -1968,7 +1968,7 @@ hipError_t ihipMemcpyHtoAValidate(const void* srcHost, hipArray* dstArray, hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; amd::BufferRect srcRect; @@ -1980,7 +1980,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, + *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, 
*dstImage, dstOrigin, copyRegion, static_cast(srcHost) + srcRect.start_, srcRowPitch, srcSlicePitch, copyMetadata); @@ -2029,7 +2029,7 @@ hipError_t ihipMemcpyAtoHValidate(hipArray* srcArray, void* dstHost, amd::Coord3 hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); @@ -2041,7 +2041,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( - *queue, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, + *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + dstRect.start_, dstRowPitch, dstSlicePitch, copyMetadata); @@ -2058,7 +2058,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3D* pCopy, - amd::HostQueue* queue) { + hip::Stream* stream) { // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the // (unified virtual address space) base address of the source data and the bytes per row to apply. // {src/dst}Array is ignored. @@ -2106,41 +2106,41 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 // Host to Device. 
return ihipMemcpyHtoDCommand(command, pCopy->srcHost, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) { // Device to Host. return ihipMemcpyDtoHCommand(command, pCopy->srcDevice, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) { // Device to Device. return ihipMemcpyDtoDCommand(command, pCopy->srcDevice, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) { // Host to Image. return ihipMemcpyHtoACommand(command, pCopy->srcHost, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) { // Image to Host. return ihipMemcpyAtoHCommand(command, pCopy->srcArray, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) { // Device to Image. 
return ihipMemcpyDtoACommand(command, pCopy->srcDevice, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) { // Image to Device. return ihipMemcpyAtoDCommand(command, pCopy->srcArray, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) { // Image to Image. return ihipMemcpyAtoACommand(command, pCopy->srcArray, pCopy->dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, stream); } else { ShouldNotReachHere(); } @@ -2212,14 +2212,14 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // Host to Host. return ihipMemcpyHtoH(pCopy->srcHost, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, pCopy->dstPitch, - pCopy->dstPitch * pCopy->dstHeight, hip::getQueue(stream)); + pCopy->dstPitch * pCopy->dstHeight, hip::getStream(stream)); } else { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - status = ihipGetMemcpyParam3DCommand(command, pCopy, queue); + status = ihipGetMemcpyParam3DCommand(command, pCopy, hip_stream); if (status != hipSuccess) return status; // Transfers from device memory to pageable host memory and transfers from any host memory to any host memory @@ -2507,13 +2507,13 @@ hipError_t ihipMemcpyAtoD(hipArray* srcArray, void* dstDevice, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == 
nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoDCommand(command, srcArray, dstDevice, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue); + dstRowPitch, dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2521,13 +2521,13 @@ hipError_t ihipMemcpyDtoA(void* srcDevice, hipArray* dstArray, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoACommand(command, srcDevice, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue); + srcRowPitch, srcSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2536,13 +2536,13 @@ hipError_t ihipMemcpyDtoD(void* srcDevice, void* dstDevice, amd::Coord3D srcOrig size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoDCommand(command, srcDevice, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue); + dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2551,13 +2551,13 @@ hipError_t ihipMemcpyDtoH(void* srcDevice, void* dstHost, amd::Coord3D srcOrigin size_t srcSlicePitch, size_t dstRowPitch, 
size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoHCommand(command, srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2566,13 +2566,13 @@ hipError_t ihipMemcpyHtoD(const void* srcHost, void* dstDevice, amd::Coord3D src size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoDCommand(command, srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2580,12 +2580,12 @@ hipError_t ihipMemcpyAtoA(hipArray* srcArray, hipArray* dstArray, amd::Coord3D s amd::Coord3D dstOrigin, amd::Coord3D copyRegion, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoACommand(command, srcArray, dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2593,13 +2593,13 @@ hipError_t 
ihipMemcpyHtoA(const void* srcHost, hipArray* dstArray, amd::Coord3D amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoACommand(command, srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue, isAsync); + srcRowPitch, srcSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2607,13 +2607,13 @@ hipError_t ihipMemcpyAtoH(hipArray* srcArray, void* dstHost, amd::Coord3D srcOri amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoHCommand(command, srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue, isAsync); + dstRowPitch, dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2673,9 +2673,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p) { } hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p, - amd::HostQueue* queue) { + hip::Stream* stream) { const HIP_MEMCPY3D desc = hip::getDrvMemcpy3DDesc(*p); - return ihipGetMemcpyParam3DCommand(command, &desc, queue); + return ihipGetMemcpyParam3DCommand(command, &desc, stream); } hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream, bool isAsync = false) { @@ -2733,8 +2733,8 @@ hipError_t 
hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, size_t offset, int64_t value, size_t valueSize, size_t sizeBytes, - amd::HostQueue* queue) { - if ((memory == nullptr) || (queue == nullptr)) { + hip::Stream* stream) { + if ((memory == nullptr) || (stream == nullptr)) { return hipErrorInvalidValue; } @@ -2744,7 +2744,7 @@ hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, si // surface=[pitch, width, height] amd::Coord3D surface(sizeBytes, sizeBytes, 1); amd::FillMemoryCommand* fillMemCommand = - new amd::FillMemoryCommand(*queue, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), + new amd::FillMemoryCommand(*stream, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), &value, valueSize, fillOffset, fillSize, surface); if (fillMemCommand == nullptr) { return hipErrorOutOfMemory; @@ -2810,7 +2810,7 @@ hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams) { } hipError_t ihipMemsetCommand(std::vector& commands, void* dst, int64_t value, - size_t valueSize, size_t sizeBytes, amd::HostQueue* queue) { + size_t valueSize, size_t sizeBytes, hip::Stream* stream) { hipError_t hip_error = hipSuccess; auto aligned_dst = amd::alignUp(reinterpret_cast
(dst), sizeof(uint64_t)); size_t offset = 0; @@ -2820,7 +2820,7 @@ hipError_t ihipMemsetCommand(std::vector& commands, void* dst, in amd::Command* command; hip_error = packFillMemoryCommand(command, memory, offset, value, valueSize, sizeBytes, - queue); + stream); commands.push_back(command); return hip_error; @@ -2854,8 +2854,8 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt } } std::vector commands; - amd::HostQueue* queue = hip::getQueue(stream); - hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, queue); + hip::Stream* hip_stream = hip::getStream(stream); + hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, hip_stream); if (hip_error != hipSuccess) { break; } @@ -2972,13 +2972,13 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte } hipError_t ihipMemset3DCommand(std::vector &commands, hipPitchedPtr pitchedDevPtr, - int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1) { + int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1) { size_t offset = 0; auto sizeBytes = extent.width * extent.height * extent.depth; amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); if (pitchedDevPtr.pitch == extent.width) { return ihipMemsetCommand(commands, pitchedDevPtr.ptr, value, elementSize, - static_cast(sizeBytes), queue); + static_cast(sizeBytes), stream); } // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. // Fall back to filling one row at a time. 
@@ -2994,7 +2994,7 @@ hipError_t ihipMemset3DCommand(std::vector &commands, hipPitchedP } amd::FillMemoryCommand* command; command = new amd::FillMemoryCommand( - *queue, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), + *stream, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), &value, elementSize, origin, region, surface); commands.push_back(command); return hipSuccess; @@ -3025,9 +3025,9 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent isAsync = true; } } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); std::vector commands; - status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, queue); + status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, hip_stream); if (status != hipSuccess) { return status; } @@ -3946,9 +3946,9 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 41b9038d..98cc7fd5 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -305,7 +305,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, - amd::HostQueue* queue, void** kernelParams, void** extra, + hip::Stream* stream, void** kernelParams, void** extra, hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, @@ -328,7 +328,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, } 
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand( - *queue, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, + *stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!kernelCommand) { return hipErrorOutOfMemory; @@ -371,9 +371,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, return status; } amd::Command* command = nullptr; - amd::HostQueue* queue = hip::getQueue(hStream); + hip::Stream* hip_stream = hip::getStream(hStream); status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, - blockDimX, blockDimY, blockDimZ, sharedMemBytes, queue, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream, kernelParams, extra, startEvent, stopEvent, flags, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); if (status != hipSuccess) { @@ -544,8 +544,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* } if (launch.hStream != nullptr) { // Validate devices to make sure it dosn't have duplicates - amd::HostQueue* queue = reinterpret_cast(launch.hStream)->asHostQueue(); - auto device = &queue->vdev()->device(); + hip::Stream* hip_stream = reinterpret_cast(launch.hStream); + auto device = &hip_stream->vdev()->device(); for (int j = 0; j < numDevices; ++j) { if (mgpu_list[j] == device) { return hipErrorInvalidDevice; @@ -562,23 +562,23 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast(launchParamsList[i].hStream); + hip_stream->finish(); } } for (int i = 0; i < numDevices; ++i) { const 
hipFunctionLaunchParams& launch = launchParamsList[i]; - amd::HostQueue* queue = reinterpret_cast(launch.hStream)->asHostQueue(); + hip::Stream* hip_stream = reinterpret_cast(launch.hStream); if (i == 0) { // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { // Save ROCclr index of the first device in the launch - firstDevice = queue->vdev()->device().index(); + firstDevice = hip_stream->vdev()->device().index(); break; } } @@ -608,9 +608,9 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast(launchParamsList[i].hStream); + hip_stream->finish(); } } @@ -739,12 +739,12 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL return hipErrorInvalidValue; } - amd::HostQueue* queue = hip::getQueue(launch.stream); + hip::Stream* hip_stream = hip::getStream(launch.stream); hipFunction_t func = nullptr; // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device and request the kernel function - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { IHIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, launch.func, dev)); break; } diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index 20d13162..aecf5064 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -34,7 
+34,7 @@ PlatformState* PlatformState::platform_; // Initiaized as nullptr by default // forward declaration of methods required for __hipRegisrterManagedVar hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); + hip::Stream& stream, bool isAsync = false); struct __CudaFatBinaryWrapper { unsigned int magic; @@ -146,9 +146,9 @@ extern "C" void __hipRegisterManagedVar( HIP_INIT_VOID(); hipError_t status = ihipMallocManaged(pointer, size, align); if (status == hipSuccess) { - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { - status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *queue); + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { + status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *stream); guarantee((status == hipSuccess), "Error during memcpy to managed memory!"); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index a0444334..e6c2839a 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -31,7 +31,8 @@ namespace hip { // ================================================================================================ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, const std::vector& cuMask, hipStreamCaptureStatus captureStatus) - : queue_(nullptr), + : amd::HostQueue(*dev->asContext(), *dev->devices()[0], 0, amd::CommandQueue::RealTimeDisabled, + convertToQueuePriority(p), cuMask), lock_("Stream Callback lock"), device_(dev), priority_(p), @@ -40,18 +41,11 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, cuMask_(cuMask), captureStatus_(captureStatus), originStream_(false), - captureID_(0) {} - -// 
================================================================================================ -Stream::~Stream() { - if (queue_ != nullptr) { - amd::ScopedLock lock(streamSetLock); - streamSet.erase(this); - - queue_->release(); - queue_ = nullptr; - } -} + captureID_(0) + { + amd::ScopedLock lock(streamSetLock); + streamSet.insert(this); + } // ================================================================================================ hipError_t Stream::EndCapture() { @@ -77,38 +71,16 @@ hipError_t Stream::EndCapture() { // ================================================================================================ bool Stream::Create() { - amd::CommandQueue::Priority p; - switch (priority_) { - case Priority::High: - p = amd::CommandQueue::Priority::High; - break; - case Priority::Low: - p = amd::CommandQueue::Priority::Low; - break; - case Priority::Normal: - default: - p = amd::CommandQueue::Priority::Normal; - break; - } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], - 0, amd::CommandQueue::RealTimeDisabled, - p, cuMask_); - - // Create a host queue - bool result = (queue != nullptr) ? queue->create() : false; - // Insert just created stream into the list of the blocking queues - if (result) { + return create(); +} + +// ================================================================================================ +bool Stream::terminate() { + { amd::ScopedLock lock(streamSetLock); - streamSet.insert(this); - queue_ = queue; - device_->SaveQueue(queue); - } else if (queue != nullptr) { - // Queue creation has failed, and virtual device associated with the queue may not be created. - // Just need to delete the queue instance. 
- delete queue; + streamSet.erase(this); } - - return result; + return HostQueue::terminate(); } // ================================================================================================ @@ -130,29 +102,6 @@ bool isValid(hipStream_t& stream) { return true; } -// ================================================================================================ -amd::HostQueue* Stream::asHostQueue(bool skip_alloc) { - if (queue_ != nullptr) { - return queue_; - } - // Access to the stream object is lock protected, because possible allocation - amd::ScopedLock l(Lock()); - if (queue_ == nullptr) { - // Create the host queue for the first time - if (!skip_alloc) { - Create(); - } - } - return queue_; -} - -// ================================================================================================ -void Stream::Finish() const { - if (queue_ != nullptr) { - queue_->finish(); - } -} - // ================================================================================================ int Stream::DeviceId() const { return device_->deviceId(); @@ -176,7 +125,7 @@ void Stream::syncNonBlockingStreams(int deviceId) { for (auto& it : streamSet) { if (it->Flags() & hipStreamNonBlocking) { if (it->DeviceId() == deviceId) { - it->asHostQueue()->finish(); + it->finish(); } } } @@ -203,7 +152,7 @@ void Stream::destroyAllStreams(int deviceId) { } } for (auto& it : toBeDeleted) { - delete it; + it->release(); } } @@ -211,36 +160,48 @@ bool Stream::StreamCaptureOngoing(void) { return (g_allCapturingStreams.empty() == true) ? 
false : true; } +bool Stream::existsActiveStreamForDevice(hip::Device* device) { + + amd::ScopedLock lock(streamSetLock); + + for (const auto& active_stream : streamSet) { + if ((active_stream->GetDevice() == device) && + active_stream->GetQueueStatus()) { + return true; + } + } + return false; +} + };// hip namespace // ================================================================================================ -void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream) { +void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { amd::Command::EventWaitList eventWaitList(0); bool submitMarker = 0; { amd::ScopedLock lock(streamSetLock); - for (const auto& stream : streamSet) { - amd::HostQueue* active_queue = stream->asHostQueue(); + for (const auto& active_stream : streamSet) { // If it's the current device - if ((&active_queue->device() == &blocking_queue->device()) && + if ((&active_stream->device() == &blocking_stream->device()) && // Make sure it's a default stream - ((stream->Flags() & hipStreamNonBlocking) == 0) && + ((active_stream->Flags() & hipStreamNonBlocking) == 0) && // and it's not the current stream - (active_queue != blocking_queue) && + (active_stream != blocking_stream) && // check for a wait on the null stream - (stream->Null() == wait_null_stream)) { + (active_stream->Null() == wait_null_stream)) { // Get the last valid command - amd::Command* command = active_queue->getLastQueuedCommand(true); + amd::Command* command = active_stream->getLastQueuedCommand(true); if (command != nullptr) { amd::Event& event = command->event(); // Check HW status of the ROCcrl event. 
// Note: not all ROCclr modes support HW status - bool ready = active_queue->device().IsHwEventReady(event); + bool ready = active_stream->device().IsHwEventReady(event); if (!ready) { ready = (command->status() == CL_COMPLETE); } - submitMarker |= active_queue->vdev()->isFenceDirty(); + submitMarker |= active_stream->vdev()->isFenceDirty(); // Check the current active status if (!ready) { command->notifyCmdQueue(); @@ -259,7 +220,7 @@ void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream // Check if we have to wait anything if (eventWaitList.size() > 0 || submitMarker) { - amd::Command* command = new amd::Marker(*blocking_queue, kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); if (command != nullptr) { command->enqueue(); command->release(); @@ -288,8 +249,11 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, } hip::Stream* hStream = new hip::Stream(hip::getCurrentDevice(), priority, flags, false, cuMask); - if (hStream == nullptr || !hStream->Create()) { - delete hStream; + if (hStream == nullptr) { + return hipErrorOutOfMemory; + } + else if (!hStream->Create()) { + hStream->release(); return hipErrorOutOfMemory; } @@ -310,7 +274,7 @@ stream_per_thread::stream_per_thread() { stream_per_thread::~stream_per_thread() { for (auto &stream:m_streams) { if (stream != nullptr && hip::isValid(stream)) { - delete reinterpret_cast(stream); + reinterpret_cast(stream)->release(); stream = nullptr; } } @@ -449,7 +413,7 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { } } // Wait for the current host queue - hip::getQueue(stream)->finish(); + hip::getStream(stream)->finish(); return hipSuccess; } @@ -498,7 +462,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) { if (l_it != hip::tls.capture_streams_.end()) { hip::tls.capture_streams_.erase(l_it); } - delete s; + s->release(); HIP_RETURN(hipSuccess); } @@ -564,9 +528,9 @@ hipError_t 
hipStreamQuery_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - amd::HostQueue* hostQueue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = hostQueue->getLastQueuedCommand(true); + amd::Command* command = hip_stream->getLastQueuedCommand(true); if (command == nullptr) { // Nothing was submitted to the queue return hipSuccess; @@ -604,13 +568,13 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* return hipErrorContextIsDestroyed; } - amd::HostQueue* hostQueue = hip::getQueue(stream); - amd::Command* last_command = hostQueue->getLastQueuedCommand(true); + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command* last_command = hip_stream->getLastQueuedCommand(true); amd::Command::EventWaitList eventWaitList; if (last_command != nullptr) { eventWaitList.push_back(last_command); } - amd::Command* command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (command == nullptr) { return hipErrorInvalidValue; } @@ -630,7 +594,7 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* // Add the new barrier to stall the stream, until the callback is done eventWaitList.clear(); eventWaitList.push_back(command); - amd::Command* block_command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* block_command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (block_command == nullptr) { return hipErrorInvalidValue; } diff --git a/src/hip_stream_ops.cpp b/src/hip_stream_ops.cpp index a3bed6cf..7032c4c6 100644 --- a/src/hip_stream_ops.cpp +++ b/src/hip_stream_ops.cpp @@ -69,11 +69,11 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void return hipErrorInvalidValue; } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* 
hip_stream = hip::getStream(stream); amd::Command::EventWaitList waitList; amd::StreamOperationCommand* command = - new amd::StreamOperationCommand(*queue, cmdType, waitList, *memory->asBuffer(), + new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(), value, mask, outFlags, offset, sizeBytes); if (command == nullptr) { diff --git a/src/hip_texture.cpp b/src/hip_texture.cpp index 610d93fe..9fead8d8 100644 --- a/src/hip_texture.cpp +++ b/src/hip_texture.cpp @@ -26,7 +26,7 @@ #include "platform/sampler.hpp" hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); + hip::Stream& stream, bool isAsync = false); hipError_t ihipFree(void* ptr); @@ -575,8 +575,8 @@ hipError_t hipBindTexture2D(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToArray(const textureReference* texref, @@ -624,8 +624,8 @@ hipError_t hipBindTextureToArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref, @@ -674,8 +674,8 @@ hipError_t hipBindTextureToMipmappedArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. 
- amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipUnbindTexture(const textureReference* texref) { @@ -701,8 +701,8 @@ hipError_t hipBindTexture(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, @@ -966,8 +966,8 @@ hipError_t hipTexRefSetArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr, @@ -1049,8 +1049,8 @@ hipError_t hipTexRefSetAddress(size_t* ByteOffset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefSetAddress2D(textureReference* texRef, @@ -1091,8 +1091,8 @@ hipError_t hipTexRefSetAddress2D(textureReference* texRef, HIP_RETURN(err); } // Copy to device. 
- amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) { @@ -1454,8 +1454,8 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, From dfa0846fdcff4fafac0126c7eb43ba99e788e42c Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Tue, 21 Feb 2023 12:50:02 +0000 Subject: [PATCH 42/56] SWDEV-319526 - SE ID size incresed. Change-Id: Ia7df94d172102b321f12c8caa821e3a7e4447a21 --- include/hip/amd_detail/amd_device_functions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h index 30d2511e..57576f57 100644 --- a/include/hip/amd_detail/amd_device_functions.h +++ b/include/hip/amd_detail/amd_device_functions.h @@ -921,7 +921,7 @@ int __syncthreads_or(int predicate) PIPE_ID 7:6 Pipeline from which the wave was dispatched. CU_ID 11:8 Compute Unit the wave is assigned to. SH_ID 12 Shader Array (within an SE) the wave is assigned to. - SE_ID 14:13 Shader Engine the wave is assigned to. + SE_ID 15:13 Shader Engine the wave is assigned to. TG_ID 19:16 Thread-group ID VM_ID 23:20 Virtual Memory ID QUEUE_ID 26:24 Queue from which this wave was dispatched. 
@@ -934,7 +934,7 @@ int __syncthreads_or(int predicate) #define HW_ID_CU_ID_SIZE 4 #define HW_ID_CU_ID_OFFSET 8 -#define HW_ID_SE_ID_SIZE 2 +#define HW_ID_SE_ID_SIZE 3 #define HW_ID_SE_ID_OFFSET 13 /* From 093ec199087eb04e5b59cae859d2285b38472c86 Mon Sep 17 00:00:00 2001 From: pghafari Date: Tue, 14 Feb 2023 20:35:46 -0500 Subject: [PATCH 43/56] SWDEV-366520 - DLL_PROCESS_DETACH update making sure the current thread is not null before calling ihipDestroyDevice Change-Id: Ib75e3f9ee1c4e0065986e3fed1065907220f4379 --- src/hip_runtime.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/hip_runtime.cpp b/src/hip_runtime.cpp index 4c877806..78eb4aca 100644 --- a/src/hip_runtime.cpp +++ b/src/hip_runtime.cpp @@ -46,9 +46,14 @@ extern "C" BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) { } #endif // DEBUG break; - case DLL_PROCESS_DETACH: + case DLL_PROCESS_DETACH: { + amd::Thread* thread = amd::Thread::current(); + if (!(thread != nullptr || + ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { + return true; + } ihipDestroyDevice(); - break; + } break; case DLL_THREAD_DETACH: { amd::Thread* thread = amd::Thread::current(); delete thread; From b56502ab72016dc7c3e822ff98e093bdf6fb2505 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 20 Jan 2023 15:35:25 -0800 Subject: [PATCH 44/56] SWDEV-364604 - Add support for hipEventDisableSystemFence Change-Id: I1a6451c873fb22729ac61e4e80f8531251e990f0 --- src/hip_event.cpp | 26 ++++++++++++++++---------- src/hip_module.cpp | 11 +++++++++-- src/hip_stream.cpp | 8 ++++++++ 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 83cbb9ef..7ddac560 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -222,13 +222,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags ) { if (command == nullptr) { int32_t releaseFlags = ((ext_flags == 0) 
? flags : ext_flags) & - (hipEventReleaseToSystem | hipEventReleaseToDevice); - if (releaseFlags & hipEventReleaseToDevice) { - releaseFlags = amd::Device::kCacheStateAgent; - } else if (releaseFlags & hipEventReleaseToSystem) { - releaseFlags = amd::Device::kCacheStateSystem; - } else { + (hipEventReleaseToDevice | hipEventReleaseToSystem | + hipEventDisableSystemFence); + if (releaseFlags & hipEventDisableSystemFence) { releaseFlags = amd::Device::kCacheStateIgnore; + } else { + releaseFlags = amd::Device::kCacheStateInvalid; } // Always submit a EventMarker. command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags); @@ -279,14 +278,21 @@ bool isValid(hipEvent_t event) { // ================================================================================================ hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming | - hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess; + hipEventReleaseToDevice | hipEventReleaseToSystem | + hipEventInterprocess | hipEventDisableSystemFence; - const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); + const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem | + hipEventDisableSystemFence); // can't set any unsupported flags. - // can't set both release flags + // can set only one of the release flags. 
// if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set const bool illegalFlags = (flags & ~supportedFlags) || - ((flags & releaseFlags) == releaseFlags) || + ([](unsigned int num){ + unsigned int bitcount; + for (bitcount = 0; num; bitcount++) { + num &= num - 1; + } + return bitcount; } (flags & releaseFlags) > 1) || ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming)); if (!illegalFlags) { hip::Event* e = nullptr; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 98cc7fd5..a3fa4919 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -388,11 +388,18 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, } } - command->enqueue(); - if (stopEvent != nullptr) { hip::Event* eStop = reinterpret_cast(stopEvent); + if (eStop->flags & hipEventDisableSystemFence) { + command->setEventScope(amd::Device::kCacheStateIgnore); + } else { + command->setEventScope(amd::Device::kCacheStateSystem); + } + // Enqueue Dispatch and bind the stop event + command->enqueue(); eStop->BindCommand(*command, false); + } else { + command->enqueue(); } if (command->status() == CL_INVALID_OPERATION) { diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index e6c2839a..80ea3013 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -225,6 +225,14 @@ void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) command->enqueue(); command->release(); } + + //Reset the dirty flag for all streams now that the marker is submitted + for (const auto& stream : streamSet) { + amd::HostQueue* active_queue = stream->asHostQueue(); + if (active_queue->vdev()->isFenceDirty()) { + active_queue->vdev()->resetFenceDirty(); + } + } } // Release all active commands. 
It's safe after the marker was enqueued From 7e9a47119ede1e5933c4e12736988fe327e98c0b Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Wed, 25 Jan 2023 08:56:32 +0000 Subject: [PATCH 45/56] SWDEV-345571 - Added support for half/half2 data types for warp shuffle functions Change-Id: I276a55129d6527b73292d75eb18403acd2623c8a --- include/hip/amd_detail/amd_hip_fp16.h | 60 +++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h index ab6496d6..fa58c3e4 100644 --- a/include/hip/amd_detail/amd_hip_fp16.h +++ b/include/hip/amd_detail/amd_hip_fp16.h @@ -1710,6 +1710,66 @@ THE SOFTWARE. using half = __half; using half2 = __half2; #endif + #if !defined(__HIPCC_RTC__) + #include "amd_device_functions.h" + #include "amd_warp_functions.h" + __device__ + inline + __half __shfl(__half var, int src_lane, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl(__half2 var, int src_lane, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl(tmp.i, src_lane, width); + return tmp.h; + } + __device__ + inline + __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_up(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) 
{ + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_down(tmp.i, lane_delta, width); + return tmp.h; + } + __device__ + inline + __half __shfl_xor(__half var, int lane_mask, int width = warpSize) { + union { int i; __half h; } tmp; tmp.h = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.h; + } + __device__ + inline + __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) { + union { int i; __half2 h; } tmp; tmp.h = var; + tmp.i = __shfl_xor(tmp.i, lane_mask, width); + return tmp.h; + } + #endif // !defined(__HIPCC_RTC__) #endif // defined(__cplusplus) #elif defined(__GNUC__) #include "hip_fp16_gcc.h" From 3c5b6ac52664c047a259a42fcf6fb278377c8efc Mon Sep 17 00:00:00 2001 From: Sarbojit Sarkar Date: Tue, 26 Jul 2022 09:41:49 +0000 Subject: [PATCH 46/56] SWDEV-293749 - Fix for operator mixup Change-Id: I61d1fea5a6ed2176dd92050c6d96cee1af3a39fb --- include/hip/amd_detail/amd_hip_complex.h | 17 ++++++---- include/hip/amd_detail/amd_hip_vector_types.h | 31 ++++++++----------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/hip/amd_detail/amd_hip_complex.h b/include/hip/amd_detail/amd_hip_complex.h index eba6eb53..9d9dfd5e 100644 --- a/include/hip/amd_detail/amd_hip_complex.h +++ b/include/hip/amd_detail/amd_hip_complex.h @@ -106,15 +106,20 @@ THE SOFTWARE. 
return lhs; \ } -#define COMPLEX_MUL_PREOP_OVERLOAD(type) \ - __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \ - lhs = lhs * rhs; \ - return lhs; \ +#define COMPLEX_MUL_PREOP_OVERLOAD(type) \ + __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \ + type temp{lhs}; \ + lhs.x = rhs.x * temp.x - rhs.y * temp.y; \ + lhs.y = rhs.y * temp.x + rhs.x * temp.y; \ + return lhs; \ } #define COMPLEX_DIV_PREOP_OVERLOAD(type) \ - __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \ - lhs = lhs / rhs; \ + __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \ + type temp; \ + temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ + temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ + lhs = temp; \ return lhs; \ } diff --git a/include/hip/amd_detail/amd_hip_vector_types.h b/include/hip/amd_detail/amd_hip_vector_types.h index dfd3b39a..8215fb02 100644 --- a/include/hip/amd_detail/amd_hip_vector_types.h +++ b/include/hip/amd_detail/amd_hip_vector_types.h @@ -544,6 +544,13 @@ template struct is_scalar : public integral_constant struct is_scalar : public integral_constant struct is_scalar : public integral_constant{x} -= y; } - template - __HOST_DEVICE__ - inline - constexpr - HIP_vector_type operator*( - const HIP_vector_type& x, const HIP_vector_type& y) noexcept - { - return HIP_vector_type{x} *= y; - } template __HOST_DEVICE__ inline @@ -737,15 +741,6 @@ template struct is_scalar : public integral_constant{x} *= y; } - template - __HOST_DEVICE__ - inline - constexpr - HIP_vector_type operator/( - const HIP_vector_type& x, const HIP_vector_type& y) noexcept - { - return HIP_vector_type{x} /= y; - } template __HOST_DEVICE__ inline From b1dcb69f712f74ff1c0f13dada72a92b508a49b9 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Thu, 23 Feb 2023 11:27:47 -0500 Subject: [PATCH 47/56] SWDEV-353281 - Add support for MemPool in graphs 
Implement hipDeviceGetGraphMemAttribute, hipDeviceSetGraphMemAttribute and hipDeviceGraphMemTrim Change-Id: I4f8fc1250ce1e8b7636d43d59ba7343158e45088 --- src/hip_device.cpp | 24 ++++++++++++++++++++---- src/hip_graph.cpp | 35 ++++++++++++++++++++++------------- src/hip_graph_internal.hpp | 16 ++++------------ src/hip_internal.hpp | 10 ++++++++-- src/hip_mempool_impl.hpp | 6 ++++-- 5 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 2b83616d..092652d0 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -44,7 +44,7 @@ hip::Stream* Device::GetNullStream() { if (null_stream_ == nullptr) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } - + if (null_stream_ == nullptr) { return nullptr; } @@ -60,6 +60,18 @@ bool Device::Create() { if (default_mem_pool_ == nullptr) { return false; } + + // Create graph memory pool + graph_mem_pool_ = new MemoryPool(this); + if (graph_mem_pool_ == nullptr) { + return false; + } + + uint64_t max_size = std::numeric_limits::max(); + // Use maximum value to hold memory, because current implementation doesn't support VM + // Note: the call for the threshold is always successful + auto error = graph_mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size); + // Current is default pool after device creation current_mem_pool_ = default_mem_pool_; return true; @@ -85,7 +97,7 @@ void Device::RemoveMemoryPool(MemoryPool* pool) { bool Device::FreeMemory(amd::Memory* memory, Stream* stream) { amd::ScopedLock lock(lock_); // Search for memory in the entire list of pools - for (auto& it : mem_pools_) { + for (auto it : mem_pools_) { if (it->FreeMemory(memory, stream)) { return true; } @@ -97,7 +109,7 @@ bool Device::FreeMemory(amd::Memory* memory, Stream* stream) { void Device::ReleaseFreedMemory(Stream* stream) { amd::ScopedLock lock(lock_); // Search for memory in the entire list of pools - for (auto& it : mem_pools_) { + for (auto it : mem_pools_) { 
it->ReleaseFreedMemory(stream); } } @@ -106,7 +118,7 @@ void Device::ReleaseFreedMemory(Stream* stream) { void Device::RemoveStreamFromPools(Stream* stream) { amd::ScopedLock lock(lock_); // Update all pools with the destroyed stream - for (auto& it : mem_pools_) { + for (auto it : mem_pools_) { it->RemoveStream(stream); } } @@ -135,6 +147,10 @@ Device::~Device() { default_mem_pool_->release(); } + if (graph_mem_pool_ != nullptr) { + graph_mem_pool_->release(); + } + if (null_stream_!= nullptr) { delete null_stream_; } diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp index 642a4bd7..f3a93253 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -2216,54 +2216,63 @@ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType at if ((static_cast(device) >= g_devices.size()) || device < 0 || value == nullptr) { HIP_RETURN(hipErrorInvalidDevice); } - // later use this to access memory pool - auto* deviceHandle = g_devices[device]->devices()[0]; + hipError_t result = hipErrorInvalidValue; switch (attr) { case hipGraphMemAttrUsedMemCurrent: - *reinterpret_cast(value) = 0; + result = g_devices[device]->GetGraphMemoryPool()->GetAttribute( + hipMemPoolAttrUsedMemCurrent, value); break; case hipGraphMemAttrUsedMemHigh: - *reinterpret_cast(value) = 0; + result = g_devices[device]->GetGraphMemoryPool()->GetAttribute( + hipMemPoolAttrUsedMemHigh, value); break; case hipGraphMemAttrReservedMemCurrent: - *reinterpret_cast(value) = 0; + result = g_devices[device]->GetGraphMemoryPool()->GetAttribute( + hipMemPoolAttrReservedMemCurrent, value); break; case hipGraphMemAttrReservedMemHigh: - *reinterpret_cast(value) = 0; + result = g_devices[device]->GetGraphMemoryPool()->GetAttribute( + hipMemPoolAttrReservedMemHigh, value); break; default: - return HIP_RETURN(hipErrorInvalidValue); + break; } - return HIP_RETURN(hipSuccess); + return HIP_RETURN(result); } +// ================================================================================================ 
hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) { HIP_INIT_API(hipDeviceSetGraphMemAttribute, device, attr, value); if ((static_cast(device) >= g_devices.size()) || device < 0 || value == nullptr) { HIP_RETURN(hipErrorInvalidDevice); } - // later use this to access memory pool - auto* deviceHandle = g_devices[device]->devices()[0]; + hipError_t result = hipErrorInvalidValue; switch (attr) { case hipGraphMemAttrUsedMemHigh: + result = g_devices[device]->GetGraphMemoryPool()->SetAttribute( + hipMemPoolAttrUsedMemHigh, value); break; case hipGraphMemAttrReservedMemHigh: + result = g_devices[device]->GetGraphMemoryPool()->SetAttribute( + hipMemPoolAttrReservedMemHigh, value); break; default: - return HIP_RETURN(hipErrorInvalidValue); + break; } - return HIP_RETURN(hipSuccess); + return HIP_RETURN(result); } +// ================================================================================================ hipError_t hipDeviceGraphMemTrim(int device) { HIP_INIT_API(hipDeviceGraphMemTrim, device); if ((static_cast(device) >= g_devices.size()) || device < 0) { HIP_RETURN(hipErrorInvalidDevice); } - // not implemented yet + g_devices[device]->GetGraphMemoryPool()->TrimTo(0); return HIP_RETURN(hipSuccess); } +// ================================================================================================ hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy, unsigned int initialRefcount, unsigned int flags) { HIP_INIT_API(hipUserObjectCreate, object_out, ptr, destroy, initialRefcount, flags); diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 4f0b5dd3..38f72581 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -405,17 +405,9 @@ struct ihipGraph { , device_(device) { amd::ScopedLock lock(graphSetLock_); graphSet_.insert(this); - if (original == nullptr) { - // Create memory pool, associated with the graph - mem_pool_ = new 
hip::MemoryPool(device); - uint64_t max_size = std::numeric_limits::max(); - // Note: the call for the threshold is always successful - auto error = mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size); - } else { - mem_pool_ = original->mem_pool_; - mem_pool_->retain(); - } - }; + mem_pool_ = device->GetGraphMemoryPool(); + mem_pool_->retain(); + } ~ihipGraph() { for (auto node : vertices_) { @@ -430,7 +422,7 @@ struct ihipGraph { mem_pool_->release(); } - }; + } void AddManualNodeDuringCapture(hipGraphNode* node) { capturedNodes_.insert(node); } diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 84782cb6..0ebb4b58 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -401,8 +401,9 @@ namespace hip { bool isActive_; - MemoryPool* default_mem_pool_; + MemoryPool* default_mem_pool_; //!< Default memory pool for this device MemoryPool* current_mem_pool_; + MemoryPool* graph_mem_pool_; //!< Memory pool, associated with graphs for this device std::set mem_pools_; @@ -412,7 +413,8 @@ namespace hip { flags_(hipDeviceScheduleSpin), isActive_(false), default_mem_pool_(nullptr), - current_mem_pool_(nullptr) + current_mem_pool_(nullptr), + graph_mem_pool_(nullptr) { assert(ctx != nullptr); } ~Device(); @@ -470,6 +472,9 @@ namespace hip { /// Get the default memory pool on the device MemoryPool* GetDefaultMemoryPool() const { return default_mem_pool_; } + /// Get the graph memory pool on the device + MemoryPool* GetGraphMemoryPool() const { return graph_mem_pool_; } + /// Add memory pool to the device void AddMemoryPool(MemoryPool* pool); @@ -484,6 +489,7 @@ namespace hip { /// Removes a destroyed stream from the safe list of memory pools void RemoveStreamFromPools(Stream* stream); + }; /// Thread Local Storage Variables Aggregator Class diff --git a/src/hip_mempool_impl.hpp b/src/hip_mempool_impl.hpp index 9d176b17..5e18cb35 100644 --- a/src/hip_mempool_impl.hpp +++ b/src/hip_mempool_impl.hpp @@ -213,17 +213,19 @@ class MemoryPool : public 
amd::ReferenceCountedObject { /// Set memory pool access by different devices void GetAccess(hip::Device* device, hipMemAccessFlags* flags); + /// Frees all busy memory + void FreeAllMemory(hip::Stream* stream = nullptr); + /// Accessors for the pool state bool EventDependencies() const { return (state_.event_dependencies_) ? true : false; } bool Opportunistic() const { return (state_.opportunistic_) ? true : false; } bool InternalDependencies() const { return (state_.internal_dependencies_) ? true : false; } - void FreeAllMemory(hip::Stream* stream = nullptr); + private: MemoryPool() = delete; MemoryPool(const MemoryPool&) = delete; MemoryPool& operator=(const MemoryPool&) = delete; - Heap busy_heap_; //!< Heap of busy allocations Heap free_heap_; //!< Heap of freed allocations struct { From 21a474f642abe969113b19f9fc1c80fd56d89dc8 Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Fri, 17 Feb 2023 14:21:59 -0500 Subject: [PATCH 48/56] SWDEV-379388 - remove check for all streams done - cuda shows a different behaviour and different error need more investigation Signed-off-by: sdashmiz Change-Id: I68771102ba4dff6157bca34a4135cb245f023d08 --- src/hip_event.cpp | 3 --- src/hip_stream.cpp | 6 ------ 2 files changed, 9 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 7ddac560..747b69ce 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -410,9 +410,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) { HIP_RETURN(hipErrorInvalidHandle); } - if (hip::Stream::StreamCaptureOngoing() == true) { - HIP_RETURN(hipErrorStreamCaptureUnsupported); - } hip::Event* e = reinterpret_cast(event); HIP_RETURN(e->synchronize()); } diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 80ea3013..1342ac72 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -502,12 +502,6 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig return hipErrorContextIsDestroyed; } - if (stream != nullptr) { - // If still capturing return error - 
if (hip::Stream::StreamCaptureOngoing() == true) { - HIP_RETURN(hipErrorStreamCaptureIsolation); - } - } hip::Event* e = reinterpret_cast(event); return e->streamWait(stream, flags); } From 870ebfca928cdd04927375c11d4a73fab5f2ab37 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Wed, 22 Feb 2023 00:05:49 +0000 Subject: [PATCH 49/56] SWDEV-381402 - Remove unused getNullStream() from device. Make stream destructor private. Change-Id: Idde30a8bfe97a525bd9f9fb50698a5cb14b798fc --- src/hip_device.cpp | 16 +--------------- src/hip_graph_internal.cpp | 2 +- src/hip_internal.hpp | 4 ++++ src/hip_mempool.cpp | 6 +++--- 4 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 092652d0..8782b6c3 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -39,20 +39,6 @@ hip::Stream* Device::NullStream(bool skip_alloc) { return null_stream_; } -// ================================================================================================ -hip::Stream* Device::GetNullStream() { - if (null_stream_ == nullptr) { - null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); - } - - if (null_stream_ == nullptr) { - return nullptr; - } - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); - return null_stream_; -} - // ================================================================================================ bool Device::Create() { // Create default memory pool @@ -152,7 +138,7 @@ Device::~Device() { } if (null_stream_!= nullptr) { - delete null_stream_; + null_stream_->release(); } } diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp index 5733cc5c..f4060a0f 100644 --- a/src/hip_graph_internal.cpp +++ b/src/hip_graph_internal.cpp @@ -569,7 +569,7 @@ hipError_t hipGraphExec::Run(hipStream_t stream) { levelOrder_[0]->GetParentGraph()->FreeAllMemory(); } } - auto hip_stream = (stream == nullptr) ? 
hip::getCurrentDevice()->GetNullStream() + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast(stream); UpdateStream(parallelLists_, hip_stream, this); std::vector rootCommands; diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 0ebb4b58..49a88326 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -380,6 +380,10 @@ namespace hip { } } static bool existsActiveStreamForDevice(hip::Device* device); + + /// The stream should be destroyed via release() rather than delete + private: + ~Stream() {}; }; /// HIP Device class diff --git a/src/hip_mempool.cpp b/src/hip_mempool.cpp index eea254f5..f798f8c8 100644 --- a/src/hip_mempool.cpp +++ b/src/hip_mempool.cpp @@ -70,7 +70,7 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) { if ((dev_ptr == nullptr) || (size == 0) || (!hip::isValid(stream))) { HIP_RETURN(hipErrorInvalidValue); } - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast(stream); auto device = hip_stream->GetDevice(); auto mem_pool = device->GetCurrentMemoryPool(); @@ -92,7 +92,7 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) { auto memory = getMemoryObject(dev_ptr, offset); if (memory != nullptr) { auto id = memory->getUserData().deviceId; - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast(stream); if (!g_devices[id]->FreeMemory(memory, hip_stream)) { //! @todo It's not the most optimal logic. The current implementation has unconditional waits @@ -241,7 +241,7 @@ hipError_t hipMallocFromPoolAsync( STREAM_CAPTURE(hipMallocAsync, stream, mem_pool, size, dev_ptr); auto mpool = reinterpret_cast(mem_pool); - auto hip_stream = (stream == nullptr) ? 
hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast(stream); *dev_ptr = mpool->AllocateMemory(size, hip_stream); HIP_RETURN(hipSuccess); From a969c1c8f9806da04e5d84a0fbc10bb17ed18603 Mon Sep 17 00:00:00 2001 From: pghafari Date: Thu, 23 Feb 2023 16:57:31 -0500 Subject: [PATCH 50/56] SWDEV-366515 - adding HIP_AD_FORMAT_SIGNED_INT16 Change-Id: I084d7b093049093ea032372593711f6dc0964af8 --- src/hip_conversions.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/hip_conversions.hpp b/src/hip_conversions.hpp index 757ccc79..ef928225 100644 --- a/src/hip_conversions.hpp +++ b/src/hip_conversions.hpp @@ -168,6 +168,12 @@ hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) { case CL_SIGNED_INT8: return HIP_AD_FORMAT_SIGNED_INT8; + case CL_UNSIGNED_INT16: + return HIP_AD_FORMAT_UNSIGNED_INT16; + + case CL_SIGNED_INT16: + return HIP_AD_FORMAT_SIGNED_INT16; + case CL_SIGNED_INT32: return HIP_AD_FORMAT_SIGNED_INT32; From a77ac73f3c85795c2e6d37778b49ecdd4e4a7267 Mon Sep 17 00:00:00 2001 From: sdashmiz Date: Mon, 13 Feb 2023 13:49:55 -0500 Subject: [PATCH 51/56] SWDEV-382838 - inetrmittent failure - test fails intermittently because object is not cleared correctly Signed-off-by: sdashmiz Change-Id: I88daf3dc08bb83d6d3f047ff48a63c8f856fb0bf --- src/hip_graph.cpp | 4 ++++ src/hip_graph_internal.hpp | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp index f3a93253..fddc49b0 100644 --- a/src/hip_graph.cpp +++ b/src/hip_graph.cpp @@ -2297,6 +2297,10 @@ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count) { if (object->referenceCount() < count || !hipUserObject::isUserObjvalid(object)) { HIP_RETURN(hipSuccess); } + //! 
If all the counts are gone not longer need the obj in the list + if (object->referenceCount() == count) { + hipUserObject::removeUSerObj(object); + } object->decreaseRefCount(count); HIP_RETURN(hipSuccess); } diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp index 38f72581..819125c1 100644 --- a/src/hip_graph_internal.hpp +++ b/src/hip_graph_internal.hpp @@ -76,8 +76,8 @@ struct hipUserObject : public amd::ReferenceCountedObject { } static bool isUserObjvalid(hipUserObject* pUsertObj) { - amd::ScopedLock lock(UserObjectLock_); - if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) { + auto it = ObjectSet_.find(pUsertObj); + if (it == ObjectSet_.end()) { return false; } return true; @@ -85,8 +85,9 @@ struct hipUserObject : public amd::ReferenceCountedObject { static void removeUSerObj(hipUserObject* pUsertObj) { amd::ScopedLock lock(UserObjectLock_); - if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) { - ObjectSet_.erase(pUsertObj); + auto it = ObjectSet_.find(pUsertObj); + if (it != ObjectSet_.end()) { + ObjectSet_.erase(it); } } From 1dcf519eb22b60eab4cd287d053ac8acff276046 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Fri, 9 Sep 2022 16:45:48 -0700 Subject: [PATCH 52/56] SWDEV-326798 - Changes in stream sync behavior Change-Id: If6d0b3876a9bf197c7e49273eaa5ca5bfae46d0b --- src/hip_context.cpp | 11 ++++++----- src/hip_device.cpp | 8 +++++--- src/hip_internal.hpp | 7 +++---- src/hip_stream.cpp | 7 +++++-- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/hip_context.cpp b/src/hip_context.cpp index f639d4ff..dfcd189c 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -91,12 +91,12 @@ void setCurrentDevice(unsigned int index) { amd::Os::setPreferredNumaNode(preferredNumaNode); } -hip::Stream* getStream(hipStream_t stream) { +hip::Stream* getStream(hipStream_t stream, bool wait) { if (stream == nullptr) { - return getNullStream(); + return getNullStream(wait); } else { hip::Stream* hip_stream = 
reinterpret_cast(stream); - if (!(hip_stream->Flags() & hipStreamNonBlocking)) { + if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); } @@ -131,9 +131,10 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -hip::Stream* getNullStream() { +hip::Stream* getNullStream(bool wait) { Device* device = getCurrentDevice(); - return device ? device->NullStream() : nullptr; + constexpr bool kSkipAlloc = false; + return device ? device->NullStream(kSkipAlloc, wait) : nullptr; } }; diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 8782b6c3..82fcee62 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -26,7 +26,7 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream(bool skip_alloc) { +hip::Stream* Device::NullStream(bool skip_alloc, bool wait) { if (null_stream_ == nullptr && !skip_alloc) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -34,8 +34,10 @@ hip::Stream* Device::NullStream(bool skip_alloc) { if (null_stream_ == nullptr) { return nullptr; } - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + if (wait == true) { + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_stream_); + } return null_stream_; } diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 49a88326..e92eaeb7 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -451,10 +451,9 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(bool skip_alloc = false); + hip::Stream* NullStream(bool skip_alloc = false, bool wait = true); Stream* GetNullStream(); - bool GetActiveStatus() { 
amd::ScopedLock lock(lock_); if (isActive_) return true; @@ -528,11 +527,11 @@ namespace hip { /// Get ROCclr queue associated with hipStream /// Note: This follows the CUDA spec to sync with default streams /// and Blocking streams - extern hip::Stream* getStream(hipStream_t stream); + extern hip::Stream* getStream(hipStream_t stream, bool wait = true); /// Get default stream associated with the ROCclr context extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern hip::Stream* getNullStream(); + extern hip::Stream* getNullStream(bool wait = true); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 1342ac72..28550c48 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -420,8 +420,9 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } + bool wait = (stream == nullptr) ? true : false; // Wait for the current host queue - hip::getStream(stream)->finish(); + hip::getStream(stream, wait)->finish(); return hipSuccess; } @@ -530,7 +531,9 @@ hipError_t hipStreamQuery_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - hip::Stream* hip_stream = hip::getStream(stream); + + bool wait = (stream == nullptr) ? true : false; + hip::Stream* hip_stream = hip::getStream(stream, wait); amd::Command* command = hip_stream->getLastQueuedCommand(true); if (command == nullptr) { From df81240bd4a639c0cd55fca4f183353fc25b574e Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Tue, 28 Feb 2023 02:28:12 -0500 Subject: [PATCH 53/56] SWDEV-326798 - Revert "SWDEV-326798 - Changes in stream sync behavior" This reverts commit 1dcf519eb22b60eab4cd287d053ac8acff276046. 
Reason for revert: HIP tests on Windows fail Change-Id: I795ed19d76a41e2fd9971414cefa5bd3be45d4bc --- src/hip_context.cpp | 11 +++++------ src/hip_device.cpp | 8 +++----- src/hip_internal.hpp | 7 ++++--- src/hip_stream.cpp | 7 ++----- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/hip_context.cpp b/src/hip_context.cpp index dfcd189c..f639d4ff 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -91,12 +91,12 @@ void setCurrentDevice(unsigned int index) { amd::Os::setPreferredNumaNode(preferredNumaNode); } -hip::Stream* getStream(hipStream_t stream, bool wait) { +hip::Stream* getStream(hipStream_t stream) { if (stream == nullptr) { - return getNullStream(wait); + return getNullStream(); } else { hip::Stream* hip_stream = reinterpret_cast(stream); - if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { + if (!(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); } @@ -131,10 +131,9 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -hip::Stream* getNullStream(bool wait) { +hip::Stream* getNullStream() { Device* device = getCurrentDevice(); - constexpr bool kSkipAlloc = false; - return device ? device->NullStream(kSkipAlloc, wait) : nullptr; + return device ? 
device->NullStream() : nullptr; } }; diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 82fcee62..8782b6c3 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -26,7 +26,7 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream(bool skip_alloc, bool wait) { +hip::Stream* Device::NullStream(bool skip_alloc) { if (null_stream_ == nullptr && !skip_alloc) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -34,10 +34,8 @@ hip::Stream* Device::NullStream(bool skip_alloc, bool wait) { if (null_stream_ == nullptr) { return nullptr; } - if (wait == true) { - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); - } + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_stream_); return null_stream_; } diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index e92eaeb7..49a88326 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -451,9 +451,10 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(bool skip_alloc = false, bool wait = true); + hip::Stream* NullStream(bool skip_alloc = false); Stream* GetNullStream(); + bool GetActiveStatus() { amd::ScopedLock lock(lock_); if (isActive_) return true; @@ -527,11 +528,11 @@ namespace hip { /// Get ROCclr queue associated with hipStream /// Note: This follows the CUDA spec to sync with default streams /// and Blocking streams - extern hip::Stream* getStream(hipStream_t stream, bool wait = true); + extern hip::Stream* getStream(hipStream_t stream); /// Get default stream associated with the ROCclr context extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern hip::Stream* getNullStream(bool wait = true); + extern hip::Stream* getNullStream(); /// Get device ID associated with the 
ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 28550c48..1342ac72 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -420,9 +420,8 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - bool wait = (stream == nullptr) ? true : false; // Wait for the current host queue - hip::getStream(stream, wait)->finish(); + hip::getStream(stream)->finish(); return hipSuccess; } @@ -531,9 +530,7 @@ hipError_t hipStreamQuery_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - - bool wait = (stream == nullptr) ? true : false; - hip::Stream* hip_stream = hip::getStream(stream, wait); + hip::Stream* hip_stream = hip::getStream(stream); amd::Command* command = hip_stream->getLastQueuedCommand(true); if (command == nullptr) { From 2483f2ca796d54f5a3b6bf04dd9f98df447bad40 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Thu, 23 Feb 2023 01:17:58 +0000 Subject: [PATCH 54/56] SWDEV-378778 - Returns with error logged when compiled for a different gpu arch Change-Id: I34fb03c4f76d08278246d29028db0eb21a7aa529 --- src/hip_platform.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index aecf5064..84ba32ff 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -662,7 +662,10 @@ void PlatformState::init() { initialized_ = true; for (auto& it : statCO_.modules_) { hipError_t err = digestFatBinary(it.first, it.second); - assert(err == hipSuccess); + if (err != hipSuccess) { + HIP_ERROR_PRINT(err); + return; + } } for (auto& it : statCO_.vars_) { it.second->resize_dVar(g_devices.size()); From 9b42cc5bf2138d39d0ba0f9402d859a25fa1b7eb Mon Sep 17 00:00:00 2001 From: Jaydeep Patel Date: Mon, 27 Feb 2023 17:23:01 +0000 Subject: [PATCH 55/56] SWDEV-383056 - Don't sync with dst device for hipMemcpyAsync. 
Change-Id: I28530e6bd870d617507592576295fc9e7eed1475 --- src/hip_code_object.cpp | 2 -- src/hip_internal.hpp | 3 ++- src/hip_memory.cpp | 19 +++++++------------ src/hip_peer.cpp | 9 +++++++-- src/hip_platform.cpp | 2 -- src/hip_texture.cpp | 3 --- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/hip_code_object.cpp b/src/hip_code_object.cpp index 778783cf..dd9de637 100644 --- a/src/hip_code_object.cpp +++ b/src/hip_code_object.cpp @@ -31,8 +31,6 @@ THE SOFTWARE. #include "platform/program.hpp" #include -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hip::Stream& stream, bool isAsync = false); hipError_t ihipFree(void* ptr); // forward declaration of methods required for managed variables hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 49a88326..ca924bd5 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -565,7 +565,8 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t de extern hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId); extern hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr, uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes); - +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + hip::Stream& stream, bool isHostAsync = false, bool isGPUAsync = true); constexpr bool kOptionChangeable = true; constexpr bool kNewDevProg = false; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index d642354a..79295437 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -451,7 +451,7 @@ void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& s } // ================================================================================================ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hip::Stream& 
stream, bool isAsync = false) { + hip::Stream& stream, bool isHostAsync, bool isGPUAsync) { hipError_t status; if (sizeBytes == 0) { // Skip if nothing needs writing. @@ -464,7 +464,6 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (src == dst && kind == hipMemcpyDefault) { return hipSuccess; } - bool isP2P = false; size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; @@ -473,24 +472,20 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - isAsync = false; + isHostAsync = false; } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - isAsync = false; - } else if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && - (srcMemory->getContext().devices().size() == 1) && - (dstMemory->getContext().devices().size() == 1)) { - isAsync = true; - isP2P = true; + isHostAsync = false; } + amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isAsync); + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync); if (status != hipSuccess) { return status; } command->enqueue(); - if (!isAsync) { + if (!isHostAsync) { command->awaitCompletion(); - } else if (isP2P) { + } else if (!isGPUAsync) { hip::Stream* pStream = hip::getNullStream(dstMemory->getContext()); amd::Command::EventWaitList waitList; waitList.push_back(command); diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index d5255103..17dc65da 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -220,7 +220,8 @@ hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevic HIP_RETURN(hipErrorInvalidDevice); } - HIP_RETURN(hipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice)); + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, 
*hip::getNullStream(), + true, false)); } hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, @@ -235,7 +236,11 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } - HIP_RETURN(hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { + return hipErrorInvalidValue; + } + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip_stream, true, true)); } hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) { diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index 84ba32ff..10b8d3f8 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -33,8 +33,6 @@ PlatformState* PlatformState::platform_; // Initiaized as nullptr by default // forward declaration of methods required for __hipRegisrterManagedVar hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hip::Stream& stream, bool isAsync = false); struct __CudaFatBinaryWrapper { unsigned int magic; diff --git a/src/hip_texture.cpp b/src/hip_texture.cpp index 9fead8d8..8c443739 100644 --- a/src/hip_texture.cpp +++ b/src/hip_texture.cpp @@ -25,9 +25,6 @@ #include "hip_conversions.hpp" #include "platform/sampler.hpp" -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hip::Stream& stream, bool isAsync = false); - hipError_t ihipFree(void* ptr); struct __hip_texture { From 7f83be52c996287aa96aaa626abf372c333a6eb1 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Tue, 17 Jan 2023 10:40:05 +0000 Subject: [PATCH 56/56] SWDEV-372153 - Add hipStreamGetDevice Implementation Change-Id: Ifd1f13e311e8221ca6d94cf27f9131eb97678067 --- include/hip/amd_detail/hip_prof_str.h | 26 ++++++++++++++++++- 
.../nvidia_detail/nvidia_hip_runtime_api.h | 14 ++++++++++ src/amdhip.def | 1 + src/hip_hcc.def.in | 1 + src/hip_hcc.map.in | 1 + src/hip_stream.cpp | 24 +++++++++++++++++ 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/include/hip/amd_detail/hip_prof_str.h b/include/hip/amd_detail/hip_prof_str.h index d72fd38d..d0b24d01 100644 --- a/include/hip/amd_detail/hip_prof_str.h +++ b/include/hip/amd_detail/hip_prof_str.h @@ -373,7 +373,8 @@ enum hip_api_id_t { HIP_API_ID_hipArray3DGetDescriptor = 360, HIP_API_ID_hipArrayGetDescriptor = 361, HIP_API_ID_hipArrayGetInfo = 362, - HIP_API_ID_LAST = 362, + HIP_API_ID_hipStreamGetDevice = 363, + HIP_API_ID_LAST = 363, HIP_API_ID_hipBindTexture = HIP_API_ID_NONE, HIP_API_ID_hipBindTexture2D = HIP_API_ID_NONE, @@ -743,6 +744,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipStreamEndCapture: return "hipStreamEndCapture"; case HIP_API_ID_hipStreamGetCaptureInfo: return "hipStreamGetCaptureInfo"; case HIP_API_ID_hipStreamGetCaptureInfo_v2: return "hipStreamGetCaptureInfo_v2"; + case HIP_API_ID_hipStreamGetDevice: return "hipStreamGetDevice"; case HIP_API_ID_hipStreamGetFlags: return "hipStreamGetFlags"; case HIP_API_ID_hipStreamGetPriority: return "hipStreamGetPriority"; case HIP_API_ID_hipStreamIsCapturing: return "hipStreamIsCapturing"; @@ -1108,6 +1110,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipStreamEndCapture", name) == 0) return HIP_API_ID_hipStreamEndCapture; if (strcmp("hipStreamGetCaptureInfo", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo; if (strcmp("hipStreamGetCaptureInfo_v2", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo_v2; + if (strcmp("hipStreamGetDevice", name) == 0) return HIP_API_ID_hipStreamGetDevice; if (strcmp("hipStreamGetFlags", name) == 0) return HIP_API_ID_hipStreamGetFlags; if (strcmp("hipStreamGetPriority", name) == 0) return HIP_API_ID_hipStreamGetPriority; if (strcmp("hipStreamIsCapturing", name) == 0) 
return HIP_API_ID_hipStreamIsCapturing; @@ -3062,6 +3065,11 @@ typedef struct hip_api_data_s { size_t* numDependencies_out; size_t numDependencies_out__val; } hipStreamGetCaptureInfo_v2; + struct { + hipStream_t stream; + hipDevice_t* device; + hipDevice_t device__val; + } hipStreamGetDevice; struct { hipStream_t stream; unsigned int* flags; @@ -5231,6 +5239,11 @@ typedef struct hip_api_data_s { cb_data.args.hipStreamGetCaptureInfo_v2.dependencies_out = (const hipGraphNode_t**)dependencies_out; \ cb_data.args.hipStreamGetCaptureInfo_v2.numDependencies_out = (size_t*)numDependencies_out; \ }; +// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')] +#define INIT_hipStreamGetDevice_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipStreamGetDevice.stream = (hipStream_t)stream; \ + cb_data.args.hipStreamGetDevice.device = (hipDevice_t*)device; \ +}; // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')] #define INIT_hipStreamGetFlags_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipStreamGetFlags.stream = (hipStream_t)stream; \ @@ -6765,6 +6778,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipStreamGetCaptureInfo_v2.dependencies_out) data->args.hipStreamGetCaptureInfo_v2.dependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.dependencies_out); if (data->args.hipStreamGetCaptureInfo_v2.numDependencies_out) data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.numDependencies_out); break; +// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')] + case HIP_API_ID_hipStreamGetDevice: + if (data->args.hipStreamGetDevice.device) data->args.hipStreamGetDevice.device__val = *(data->args.hipStreamGetDevice.device); + break; // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')] case HIP_API_ID_hipStreamGetFlags: if (data->args.hipStreamGetFlags.flags) data->args.hipStreamGetFlags.flags__val = 
*(data->args.hipStreamGetFlags.flags); @@ -9491,6 +9508,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da else { oss << ", numDependencies_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val); } oss << ")"; break; + case HIP_API_ID_hipStreamGetDevice: + oss << "hipStreamGetDevice("; + oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.stream); + if (data->args.hipStreamGetDevice.device == NULL) oss << ", device=NULL"; + else { oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.device__val); } + oss << ")"; + break; case HIP_API_ID_hipStreamGetFlags: oss << "hipStreamGetFlags("; oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetFlags.stream); diff --git a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h index d7701826..4c8be9af 100644 --- a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h +++ b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h @@ -2507,6 +2507,20 @@ inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallb cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags)); } +inline static hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) { + hipCtx_t context; + auto err = hipCUResultTohipError(cuStreamGetCtx(stream, &context)); + if (err != hipSuccess) return err; + // Make the stream's context current so that cuCtxGetDevice reports the stream's device. + err = hipCUResultTohipError(cuCtxPushCurrent(context)); + if (err != hipSuccess) return err; + + err = hipCUResultTohipError(cuCtxGetDevice(device)); + // Always pop the context pushed above, even when the device query failed. + const auto popErr = hipCUResultTohipError(cuCtxPopCurrent(&context)); + return (err != hipSuccess) ? err : popErr; +} + inline static hipError_t hipDriverGetVersion(int* driverVersion) { return hipCUDAErrorTohipError(cudaDriverGetVersion(driverVersion)); } diff --git a/src/amdhip.def b/src/amdhip.def index 
70279e21..ffaff7f5 100644 --- a/src/amdhip.def +++ b/src/amdhip.def @@ -193,6 +193,7 @@ hipStreamCreate hipStreamCreateWithFlags hipStreamCreateWithPriority hipStreamDestroy +hipStreamGetDevice hipStreamGetFlags hipStreamQuery hipStreamSynchronize diff --git a/src/hip_hcc.def.in b/src/hip_hcc.def.in index 129fa7c4..fe219359 100644 --- a/src/hip_hcc.def.in +++ b/src/hip_hcc.def.in @@ -194,6 +194,7 @@ hipStreamCreate hipStreamCreateWithFlags hipStreamCreateWithPriority hipStreamDestroy +hipStreamGetDevice hipStreamGetFlags hipStreamQuery hipStreamSynchronize diff --git a/src/hip_hcc.map.in b/src/hip_hcc.map.in index 81251cca..204b139f 100644 --- a/src/hip_hcc.map.in +++ b/src/hip_hcc.map.in @@ -169,6 +169,7 @@ global: hipStreamCreateWithFlags; hipStreamCreateWithPriority; hipStreamDestroy; + hipStreamGetDevice; hipStreamGetFlags; hipStreamQuery; hipStreamSynchronize; diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 1342ac72..6d085fe7 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -795,3 +795,27 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32 } HIP_RETURN(hipSuccess); } + +// ================================================================================================ +hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) { + HIP_INIT_API(hipStreamGetDevice, stream, device); + // Writes the id of the device that owns `stream` to `*device`; a null stream maps to the current device. + if (device == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (!hip::isValid(stream)) { + HIP_RETURN(hipErrorContextIsDestroyed); // HIP_RETURN already returns; no `return` prefix + } + + if (stream == nullptr) { // handle null stream + // null stream is associated with current device, return the device id associated with the + // current device + *device = hip::getCurrentDevice()->deviceId(); + } else { + getStreamPerThread(stream); + *device = reinterpret_cast(stream)->DeviceId(); + } + + HIP_RETURN(hipSuccess); +}