-
Notifications
You must be signed in to change notification settings - Fork 491
DEVICE: Improve error logs in GPU and host device APIs #10921
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
325142a
5228435
58e73ed
e00e62a
66e0732
b2a0396
a1b7d98
77968d5
e82e867
31b28e2
fbc6bb7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,8 @@ | |
| #define UCS_DEVICE_CODE_H | ||
|
|
||
| #include <ucs/sys/compiler_def.h> | ||
| #include <ucs/sys/string.h> | ||
| #include <ucs/type/status.h> | ||
| #include <stdint.h> | ||
|
|
||
| /* | ||
|
|
@@ -35,8 +37,7 @@ typedef enum { | |
| } ucs_device_level_t; | ||
|
|
||
|
|
||
| static UCS_F_ALWAYS_INLINE const char* | ||
| ucs_device_level_name(ucs_device_level_t level) | ||
| UCS_F_DEVICE const char *ucs_device_level_name(ucs_device_level_t level) | ||
| { | ||
| switch (level) { | ||
| case UCS_DEVICE_LEVEL_THREAD: | ||
|
|
@@ -86,12 +87,34 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) | |
| } | ||
|
|
||
|
|
||
| /** | ||
| * @brief Device compatible basename function | ||
| * | ||
| * Thin wrapper around UCS_BASENAME(): returns a pointer to the file name | ||
| * part of @a path, same as basename but without modifying the source | ||
| * string. | ||
| * | ||
| * @param [in] path Path to parse. Must not be NULL: UCS_BASENAME() | ||
| *                  dereferences it without checking. | ||
| * | ||
| * @return File name, as produced by UCS_BASENAME(path) | ||
| */ | ||
| UCS_F_DEVICE const char *ucs_device_basename(const char *path) | ||
| { | ||
| return UCS_BASENAME(path); | ||
| } | ||
|
|
||
|
|
||
| /* Device log format - matches UCX host log structure. | ||
| * | ||
| * ucs_device_printf() supplies the arguments for the conversions in this | ||
| * order: an empty string (%20s), threadIdx.x and blockIdx.x (%-8d, %-7d), | ||
| * ucs_device_basename(__FILE__) (%17s), __LINE__ (%-4u), "UCX" (%-4s), | ||
| * the level string (%-5s), then a field width of 0 and an empty string | ||
| * for the trailing %*s. | ||
| * | ||
| * NOTE(review): the thread/block indices are printed with %d and __LINE__ | ||
| * with %u - confirm these conversions match the argument types. | ||
| */ | ||
| #define UCS_DEVICE_LOG_FMT "%20s[%-8d:%-7d] %17s:%-4u %-4s %-5s %*s" | ||
|
|
||
|
|
||
| /* Helper macro to print a message from a device function including the | ||
| * thread and block indices */ | ||
| #define ucs_device_printf(_title, _fmt, ...) \ | ||
| printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ | ||
| * thread and block indices, file and line */ | ||
| #define ucs_device_printf(_level, _fmt, ...) \ | ||
| printf(UCS_DEVICE_LOG_FMT _fmt "\n", "", threadIdx.x, blockIdx.x, \ | ||
| ucs_device_basename(__FILE__), __LINE__, "UCX", _level, 0, "", \ | ||
| ##__VA_ARGS__) | ||
|
|
||
|
|
||
| /* Print an error message from a device function. | ||
| * Forwards _fmt and any variadic arguments to ucs_device_printf() with the | ||
| * level string "ERROR". */ | ||
| #define ucs_device_error(_fmt, ...) \ | ||
| ucs_device_printf("ERROR", _fmt, ##__VA_ARGS__) | ||
|
|
@@ -101,4 +124,21 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) | |
| #define ucs_device_debug(_fmt, ...) \ | ||
| ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__) | ||
|
|
||
|
|
||
| /** | ||
| * @brief Device compatible status code to string conversion | ||
| * | ||
| * @param [in] status Status code to convert | ||
| * | ||
| * @return String representation of the status code | ||
| */ | ||
| UCS_F_DEVICE const char *ucs_device_status_string(ucs_status_t status) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can we use original |
||
| { | ||
| switch (status) { | ||
| UCS_STATUS_STRING_CASES | ||
| default: | ||
| return "Unknown error"; | ||
| }; | ||
| } | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -40,6 +40,22 @@ BEGIN_C_DECLS | |||||
| #define UCS_VALUE_UNKNOWN_STR "unknown" | ||||||
|
|
||||||
|
|
||||||
/*
 * Basename implementation logic used in both host and device code.
 *
 * Evaluates to a pointer to the character that follows the last '/' in
 * _path, or to the start of _path when it contains no '/'. The string is
 * only read, never modified. _path must be a valid NUL-terminated string;
 * it is dereferenced without a NULL check.
 *
 * _path is evaluated exactly once, so arguments with side effects are safe.
 * The temporaries carry a macro-specific prefix so that a caller variable
 * named like a temporary cannot be captured by the expansion.
 *
 * Implemented as a GNU statement expression so it can be used where a
 * function call is not available.
 */
#define UCS_BASENAME(_path) \
    ({ \
        const char *_ucs_basename_p      = (_path); \
        const char *_ucs_basename_result = _ucs_basename_p; \
        while (*_ucs_basename_p != '\0') { \
            if (*_ucs_basename_p == '/') { \
                _ucs_basename_result = _ucs_basename_p + 1; \
            } \
            _ucs_basename_p++; \
        } \
        _ucs_basename_result; \
    })
|
|
||||||
|
|
||||||
|
|
||||||
| /** | ||||||
| * Expand a partial path to full path. | ||||||
| * | ||||||
|
|
@@ -210,9 +226,7 @@ char *ucs_strtrim(char *str); | |||||
| */ | ||||||
| static UCS_F_ALWAYS_INLINE const char* ucs_basename(const char *path) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we do it without a macro, by declaring the function as both host and device?
Suggested change
|
||||||
| { | ||||||
| const char *name = strrchr(path, '/'); | ||||||
|
|
||||||
| return (name == NULL) ? path : name + 1; | ||||||
| return UCS_BASENAME(path); | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,50 @@ BEGIN_C_DECLS | |
| * @} | ||
| */ | ||
|
|
||
| /** | ||
| * @ingroup UCS_RESOURCE | ||
| * @brief X-macro for defining status codes and their string representations | ||
| * | ||
| * This macro allows defining status codes and their associated messages in one | ||
| * place, avoiding duplication between enum definitions and string conversions. | ||
| * | ||
| * @param _macro Function-like macro taking three arguments (ID, VALUE, MSG): | ||
| *               the enumerator name, its integer value and its message | ||
| *               string. It is expanded once per status, in the order listed | ||
| *               below. | ||
| * | ||
| * The expansions are emitted back to back with no separator between them, so | ||
| * _macro has to supply any trailing ',' or ';' itself (UCS_STATUS_ENUMIFY ends | ||
| * in a comma, UCS_STATUS_STRINGIFY in a complete return statement). | ||
| * | ||
| * Values are not contiguous: -26..-39 are not listed, and the link failure | ||
| * (-40..-59) and endpoint failure (-60..-89) spans are represented only by | ||
| * their first/last markers plus UCS_ERR_ENDPOINT_TIMEOUT (-80). | ||
| */ | ||
| #define UCS_FOREACH_STATUS(_macro) \ | ||
| _macro(UCS_OK, 0, "Success") \ | ||
| _macro(UCS_INPROGRESS, 1, "Operation in progress") \ | ||
| _macro(UCS_ERR_NO_MESSAGE, -1, "No pending message") \ | ||
| _macro(UCS_ERR_NO_RESOURCE, -2, "No resources are available to initiate the operation") \ | ||
| _macro(UCS_ERR_IO_ERROR, -3, "Input/output error") \ | ||
| _macro(UCS_ERR_NO_MEMORY, -4, "Out of memory") \ | ||
| _macro(UCS_ERR_INVALID_PARAM, -5, "Invalid parameter") \ | ||
| _macro(UCS_ERR_UNREACHABLE, -6, "Destination is unreachable") \ | ||
| _macro(UCS_ERR_INVALID_ADDR, -7, "Address not valid") \ | ||
| _macro(UCS_ERR_NOT_IMPLEMENTED, -8, "Function not implemented") \ | ||
| _macro(UCS_ERR_MESSAGE_TRUNCATED, -9, "Message truncated") \ | ||
| _macro(UCS_ERR_NO_PROGRESS, -10, "No progress") \ | ||
| _macro(UCS_ERR_BUFFER_TOO_SMALL, -11, "Provided buffer is too small") \ | ||
| _macro(UCS_ERR_NO_ELEM, -12, "No such element") \ | ||
| _macro(UCS_ERR_SOME_CONNECTS_FAILED, -13, "Failed to connect some of the requested endpoints") \ | ||
| _macro(UCS_ERR_NO_DEVICE, -14, "No such device") \ | ||
| _macro(UCS_ERR_BUSY, -15, "Device is busy") \ | ||
| _macro(UCS_ERR_CANCELED, -16, "Request canceled") \ | ||
| _macro(UCS_ERR_SHMEM_SEGMENT, -17, "Shared memory error") \ | ||
| _macro(UCS_ERR_ALREADY_EXISTS, -18, "Element already exists") \ | ||
| _macro(UCS_ERR_OUT_OF_RANGE, -19, "Index out of range") \ | ||
| _macro(UCS_ERR_TIMED_OUT, -20, "Operation timed out") \ | ||
| _macro(UCS_ERR_EXCEEDS_LIMIT, -21, "User-defined limit was reached") \ | ||
| _macro(UCS_ERR_UNSUPPORTED, -22, "Unsupported operation") \ | ||
| _macro(UCS_ERR_REJECTED, -23, "Operation rejected by remote peer") \ | ||
| _macro(UCS_ERR_NOT_CONNECTED, -24, "Endpoint is not connected") \ | ||
| _macro(UCS_ERR_CONNECTION_RESET, -25, "Connection reset by remote peer") \ | ||
| _macro(UCS_ERR_FIRST_LINK_FAILURE, -40, "First link failure") \ | ||
| _macro(UCS_ERR_LAST_LINK_FAILURE, -59, "Last link failure") \ | ||
| _macro(UCS_ERR_FIRST_ENDPOINT_FAILURE, -60, "First endpoint failure") \ | ||
| _macro(UCS_ERR_ENDPOINT_TIMEOUT, -80, "Endpoint timeout") \ | ||
| _macro(UCS_ERR_LAST_ENDPOINT_FAILURE, -89, "Last endpoint failure") \ | ||
| _macro(UCS_ERR_LAST, -100, "Last error code") | ||
|
|
||
| /** | ||
| * @ingroup UCS_RESOURCE | ||
| * @brief Status codes | ||
|
|
@@ -42,47 +86,10 @@ BEGIN_C_DECLS | |
| * For example, if a link fails it may be sufficient to destroy (and possibly | ||
| * replace) it, in contrast to an endpoint-level error. | ||
| */ | ||
| #define UCS_STATUS_ENUMIFY(ID, VALUE, _) ID = VALUE, | ||
|
|
||
| typedef enum { | ||
| /* Operation completed successfully */ | ||
| UCS_OK = 0, | ||
|
|
||
| /* Operation is queued and still in progress */ | ||
| UCS_INPROGRESS = 1, | ||
|
|
||
| /* Failure codes */ | ||
| UCS_ERR_NO_MESSAGE = -1, | ||
| UCS_ERR_NO_RESOURCE = -2, | ||
| UCS_ERR_IO_ERROR = -3, | ||
| UCS_ERR_NO_MEMORY = -4, | ||
| UCS_ERR_INVALID_PARAM = -5, | ||
| UCS_ERR_UNREACHABLE = -6, | ||
| UCS_ERR_INVALID_ADDR = -7, | ||
| UCS_ERR_NOT_IMPLEMENTED = -8, | ||
| UCS_ERR_MESSAGE_TRUNCATED = -9, | ||
| UCS_ERR_NO_PROGRESS = -10, | ||
| UCS_ERR_BUFFER_TOO_SMALL = -11, | ||
| UCS_ERR_NO_ELEM = -12, | ||
| UCS_ERR_SOME_CONNECTS_FAILED = -13, | ||
| UCS_ERR_NO_DEVICE = -14, | ||
| UCS_ERR_BUSY = -15, | ||
| UCS_ERR_CANCELED = -16, | ||
| UCS_ERR_SHMEM_SEGMENT = -17, | ||
| UCS_ERR_ALREADY_EXISTS = -18, | ||
| UCS_ERR_OUT_OF_RANGE = -19, | ||
| UCS_ERR_TIMED_OUT = -20, | ||
| UCS_ERR_EXCEEDS_LIMIT = -21, | ||
| UCS_ERR_UNSUPPORTED = -22, | ||
| UCS_ERR_REJECTED = -23, | ||
| UCS_ERR_NOT_CONNECTED = -24, | ||
| UCS_ERR_CONNECTION_RESET = -25, | ||
|
|
||
| UCS_ERR_FIRST_LINK_FAILURE = -40, | ||
| UCS_ERR_LAST_LINK_FAILURE = -59, | ||
| UCS_ERR_FIRST_ENDPOINT_FAILURE = -60, | ||
| UCS_ERR_ENDPOINT_TIMEOUT = -80, | ||
| UCS_ERR_LAST_ENDPOINT_FAILURE = -89, | ||
|
|
||
| UCS_ERR_LAST = -100 | ||
| UCS_FOREACH_STATUS(UCS_STATUS_ENUMIFY) | ||
| } UCS_S_PACKED ucs_status_t; | ||
|
|
||
|
|
||
|
|
@@ -112,6 +119,20 @@ typedef void *ucs_status_ptr_t; | |
| #define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) | ||
| #define UCS_STATUS_IS_ERR(_status) ((_status) < 0) | ||
|
|
||
| /** | ||
| * @brief Helper macro to generate switch case for status to string conversion | ||
| * | ||
| * Expands to "case ID: return MSG;". The second argument (the numeric value | ||
| * of the status) is ignored. Intended as the callback passed to | ||
| * UCS_FOREACH_STATUS(), which is how UCS_STATUS_STRING_CASES is built; the | ||
| * expansion must therefore sit inside a switch statement of a function that | ||
| * can return MSG. | ||
| */ | ||
| #define UCS_STATUS_STRINGIFY(ID, _, MSG) \ | ||
| case ID: \ | ||
| return MSG; | ||
| /** | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. missing newline |
||
| * @brief Common status code to string cases | ||
| * | ||
| * This macro defines the common switch cases for converting status codes to | ||
| * strings. It's used by both the host and device implementations to avoid | ||
| * code duplication. | ||
| */ | ||
| #define UCS_STATUS_STRING_CASES UCS_FOREACH_STATUS(UCS_STATUS_STRINGIFY) | ||
|
|
||
| /** | ||
| * @param status UCS status code. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need to remove that one eventually?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a scoped handler in the test