
Commit 9f1cee5

[SYCL][CUDA] Improve kernel launch error handling for out-of-registers (#12604)
This PR improves error handling by specializing `PI_ERROR_OUT_OF_RESOURCES`. Previously, the CUDA backend reported the out-of-resources launch error (for exceeded registers) as an invalid work-group-size error. Paired with the UR adapter change oneapi-src/unified-runtime#1318, which returns the correct error code, we no longer emit a misleading error message to users. Also adds a fallback message for the generic out-of-resources error codes returned from APIs (e.g. for kernel launch). Fixes issue: oneapi-src/unified-runtime#1308
1 parent 030a937 commit 9f1cee5
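
With this change, a CUDA kernel launch that over-subscribes registers surfaces as a `sycl::exception` carrying `errc::nd_range` and a descriptive message, instead of a misleading invalid-work-group-size error. Below is a minimal, hedged sketch of how user code can observe the new behaviour; the kernel body and sizes are placeholders (any launch whose register usage exceeds the per-block budget would do), not the test's actual kernel.

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  try {
    // Placeholder launch: imagine a register-heavy kernel with a large work-group.
    q.parallel_for(
         sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{1024}},
         [=](sycl::nd_item<1>) { /* register-heavy kernel body */ })
        .wait_and_throw();
  } catch (const sycl::exception &e) {
    // With this commit, register exhaustion on the CUDA backend reports
    // errc::nd_range with the message
    // "Exceeded the number of registers available on the hardware."
    if (e.code() == sycl::make_error_code(sycl::errc::nd_range))
      std::cerr << e.what() << '\n';
  }
}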

File tree

3 files changed: +71 −53 lines changed


sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
@@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   endfunction()

   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 9f783837089c970a22cda08f768aa3dbed38f0d3
-  # Merge: c015f892 b9442104
+  # commit 5083f4f96557672b7b6a55ea53347896d40549d7
+  # Merge: a97eed15 4c3f9abe
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Fri May 31 10:20:23 2024 +0100
-  # Merge pull request #1533 from AllanZyne/sanitizer-buffer
-  # [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
-  set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3)
+  # Date:   Fri May 31 17:20:01 2024 +0100
+  # Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle
+  # [CUDA][HIP] Catch and report bad_alloc errors for event object creation
+  set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7)

   fetch_adapter_source(level_zero
     ${UNIFIED_RUNTIME_REPO}

sycl/source/detail/error_handling/error_handling.cpp

Lines changed: 57 additions & 43 deletions
@@ -20,6 +20,60 @@ namespace sycl {
 inline namespace _V1 {
 namespace detail::enqueue_kernel_launch {

+void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
+                          const NDRDescT &NDRDesc) {
+  sycl::platform Platform = DeviceImpl.get_platform();
+  sycl::backend Backend = Platform.get_backend();
+  if (Backend == sycl::backend::ext_oneapi_cuda) {
+    // PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
+    // required for the launch config exceeds the maximum number of registers
+    // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
+    // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
+    // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
+    // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
+    // Specifications per Compute Capability, for limitations.
+    const size_t TotalNumberOfWIs =
+        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
+
+    const uint32_t MaxRegistersPerBlock =
+        DeviceImpl.get_info<ext::codeplay::experimental::info::device::
+                                max_registers_per_work_group>();
+
+    const PluginPtr &Plugin = DeviceImpl.getPlugin();
+    sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();
+
+    uint32_t NumRegisters = 0;
+    Plugin->call<PiApiKind::piKernelGetGroupInfo>(
+        Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
+        &NumRegisters, nullptr);
+
+    const bool HasExceededAvailableRegisters =
+        TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
+
+    if (HasExceededAvailableRegisters) {
+      std::string message(
+          "Exceeded the number of registers available on the hardware.\n");
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::nd_range),
+          // Additional information which can be helpful to the user.
+          message.append(
+              "\tThe number registers per work-group cannot exceed " +
+              std::to_string(MaxRegistersPerBlock) +
+              " for this kernel on this device.\n"
+              "\tThe kernel uses " +
+              std::to_string(NumRegisters) +
+              " registers per work-item for a total of " +
+              std::to_string(TotalNumberOfWIs) +
+              " work-items per work-group.\n"));
+    }
+  }
+  // Fallback
+  constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
+  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                        "PI backend failed. PI backend returns:" +
+                            codeToString(Error));
+}
+
 void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
                                 const NDRDescT &NDRDesc) {
   sycl::platform Platform = DeviceImpl.get_platform();

@@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
   bool IsOpenCLV1x = false;   // Backend is OpenCL 1.x
   bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
   bool IsLevelZero = false;   // Backend is any OneAPI Level 0 version
-  bool IsCuda = false;        // Backend is CUDA
   auto Backend = Platform.get_backend();
   if (Backend == sycl::backend::opencl) {
     std::string VersionString =

@@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
         (VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
   } else if (Backend == sycl::backend::ext_oneapi_level_zero) {
     IsLevelZero = true;
-  } else if (Backend == sycl::backend::ext_oneapi_cuda) {
-    IsCuda = true;
   }

   const PluginPtr &Plugin = DeviceImpl.getPlugin();

@@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
        // else unknown. fallback (below)
      }
    }
-  } else if (IsCuda) {
-    // CUDA:
-    // PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
-    // required for the launch config exceeds the maximum number of registers
-    // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
-    // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
-    // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
-    // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
-    // Specifications per Compute Capability, for limitations.
-    const size_t TotalNumberOfWIs =
-        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
-
-    uint32_t NumRegisters = 0;
-    Plugin->call<PiApiKind::piKernelGetGroupInfo>(
-        Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
-        &NumRegisters, nullptr);
-
-    uint32_t MaxRegistersPerBlock =
-        DeviceImpl.get_info<ext::codeplay::experimental::info::device::
-                                max_registers_per_work_group>();
-
-    const bool HasExceededAvailableRegisters =
-        TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
-
-    if (HasExceededAvailableRegisters) {
-      std::string message(
-          "Exceeded the number of registers available on the hardware.\n");
-      throw sycl::nd_range_error(
-          // Additional information which can be helpful to the user.
-          message.append(
-              "\tThe number registers per work-group cannot exceed " +
-              std::to_string(MaxRegistersPerBlock) +
-              " for this kernel on this device.\n"
-              "\tThe kernel uses " +
-              std::to_string(NumRegisters) +
-              " registers per work-item for a total of " +
-              std::to_string(TotalNumberOfWIs) +
-              " work-items per work-group.\n"),
-          PI_ERROR_INVALID_WORK_GROUP_SIZE);
-    }
   } else {
     // TODO: Decide what checks (if any) we need for the other backends
   }

@@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
   assert(Error != PI_SUCCESS &&
          "Success is expected to be handled on caller side");
   switch (Error) {
+  case PI_ERROR_OUT_OF_RESOURCES:
+    return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);
+
   case PI_ERROR_INVALID_WORK_GROUP_SIZE:
     return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);
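
For intuition, the check in the new handleOutOfResources is plain arithmetic over the launch configuration: the work-group's total work-item count times the kernel's registers per work-item is compared against the per-block register budget. A short sketch with made-up numbers follows; the real values come from piKernelGetGroupInfo and the max_registers_per_work_group device query at runtime.

// Illustrative values only; actual numbers are queried from the kernel/device.
const size_t LocalSize[3] = {32, 8, 4};      // 32 * 8 * 4 = 1024 work-items
const uint32_t NumRegisters = 96;            // per work-item (PI_KERNEL_GROUP_INFO_NUM_REGS)
const uint32_t MaxRegistersPerBlock = 65536; // from max_registers_per_work_group

const size_t TotalNumberOfWIs = LocalSize[0] * LocalSize[1] * LocalSize[2];
// 1024 * 96 = 98304 > 65536, so the specialized nd_range exception is thrown.
const bool HasExceededAvailableRegisters =
    TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;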

sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp

Lines changed: 8 additions & 4 deletions
@@ -26,7 +26,7 @@
 class kernel_vadd_and_sum;

 int main() {
-  sycl::queue q;
+  sycl::queue q{};
   sycl::device dev = q.get_device();
   size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
   if (local_size < 1024u) {

@@ -80,6 +80,8 @@ int main() {
         // compute vector add
         const auto vadd = values1 + values2 + values3 + values4;

+        // NB: 64 registers used to do the vector addition.
+
         // compute total vector elements sum
         auto sum = elem_t(0);
         for (int j = 0; j < VEC_DIM; j++) {

@@ -92,11 +94,13 @@ int main() {
         output[i] = vadd;
         output[i] += sum;
       });
-    }).wait();
-  } catch (sycl::exception &e) {
+    }).wait_and_throw();
+  } catch (const sycl::exception &e) {
     using std::string_view_literals::operator""sv;
     auto Msg = "Exceeded the number of registers available on the hardware."sv;
-    if (std::string(e.what()).find(Msg) != std::string::npos) {
+    auto Errc = sycl::make_error_code(sycl::errc::nd_range);
+    if (e.code() == Errc &&
+        std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
       return 0;
     }
   }
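
As an aside, the register budget the new handler compares against is the same value applications can read through the Codeplay experimental device query used in the diff above. A small sketch, assuming a backend/device where the query is meaningful (primarily CUDA):

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  sycl::device dev = q.get_device();
  // Experimental Codeplay descriptor; reports the maximum registers a single
  // work-group may use on this device.
  const uint32_t max_regs =
      dev.get_info<sycl::ext::codeplay::experimental::info::device::
                       max_registers_per_work_group>();
  std::cout << "max registers per work-group: " << max_regs << '\n';
}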
