From 325142a18872c0c30b733748311f92c33b9e8740 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Mon, 29 Sep 2025 12:19:48 +0300 Subject: [PATCH 1/8] DEVICE: Add logs --- src/ucp/api/device/ucp_device_impl.h | 12 ++++ src/ucp/core/ucp_device.c | 4 ++ src/ucs/sys/device_code.h | 84 ++++++++++++++++++++++++++-- src/uct/api/device/uct_device_impl.h | 11 ++++ src/uct/ib/mlx5/gdaki/gdaki.cuh | 3 + 5 files changed, 110 insertions(+), 4 deletions(-) diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index 5c3ece9f71a..302cb19a991 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -88,6 +88,10 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send( if ((mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) || (first_mem_elem_index >= mem_list_h->mem_list_length)) { + ucs_device_error("invalid parameters: mem_list version=%u (expected %u), " + "first_mem_elem_index=%u, mem_list_length=%u", + mem_list_h->version, UCP_DEVICE_MEM_LIST_VERSION_V1, + first_mem_elem_index, mem_list_h->mem_list_length); return UCS_ERR_INVALID_PARAM; } @@ -144,6 +148,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep, uct_elem, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_index=%u", + ucs_device_status_string(status), mem_list_index); return status; } @@ -195,6 +201,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep, uct_elem, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_index=%u", + ucs_device_status_string(status), mem_list_index); return status; } @@ -259,6 +267,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep, uct_mem_list, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_length=%u", + ucs_device_status_string(status), mem_list_h->mem_list_length); return status; } @@ -334,6 +344,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial( status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep, uct_mem_list, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_count=%u", + ucs_device_status_string(status), mem_list_count); return status; } diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index dffba8622b5..b777aaa103d 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -380,6 +380,7 @@ ucp_device_mem_list_create(ucp_ep_h ep, uct_allocated_memory_t mem; if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { + ucs_error("ep=%p didn't complete wireup", ep); return UCS_ERR_NOT_CONNECTED; } @@ -388,6 +389,8 @@ ucp_device_mem_list_create(ucp_ep_h ep, &local_sys_dev, &local_md_map, &mem_type); if (status != UCS_OK) { + ucs_error("ep=%p check parameters failed: %s", ep, + ucs_status_string(status)); return status; } @@ -426,6 +429,7 @@ ucp_device_mem_list_create(ucp_ep_h ep, /* Track memory allocator for later release */ status = ucp_device_mem_handle_hash_insert(&mem); if (status != UCS_OK) { + ucs_error("failed to insert handle: %s", ucs_status_string(status)); uct_mem_free(&mem); } else { *handle_p = mem.address; diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index d80835fe5c1..48fc98cc5ac 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -8,6 +8,7 @@ #define UCS_DEVICE_CODE_H #include +#include #include /* @@ -35,7 +36,7 @@ typedef enum { } ucs_device_level_t; -static UCS_F_ALWAYS_INLINE const char* +UCS_F_DEVICE const char* ucs_device_level_name(ucs_device_level_t level) { switch (level) { @@ -87,10 +88,10 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) /* Helper macro to print a message from a device function including the - * thread and block indices */ + * thread and block indices, file, line, and function */ #define ucs_device_printf(_title, _fmt, ...) \ - printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ - ##__VA_ARGS__) + printf("(%d:%d) %6s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) /* Print an error message from a device function */ #define ucs_device_error(_fmt, ...) \ @@ -101,4 +102,79 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) #define ucs_device_debug(_fmt, ...) \ ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__) + +/** + * @brief Device compatible status code to string conversion + * + * This function provides status code to string conversion that can be called + * from device code. Returns a short string representation of the status code. + * + * @param [in] status Status code to convert + * + * @return Short string representation of the status code + */ +UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status) +{ + switch (status) { + case UCS_OK: + return "Success"; + case UCS_INPROGRESS: + return "Operation in progress"; + case UCS_ERR_NO_MESSAGE: + return "No pending message"; + case UCS_ERR_NO_RESOURCE: + return "No resources are available to initiate the operation"; + case UCS_ERR_IO_ERROR: + return "Input/output error"; + case UCS_ERR_NO_MEMORY: + return "Out of memory"; + case UCS_ERR_INVALID_PARAM: + return "Invalid parameter"; + case UCS_ERR_UNREACHABLE: + return "Destination is unreachable"; + case UCS_ERR_INVALID_ADDR: + return "Address not valid"; + case UCS_ERR_NOT_IMPLEMENTED: + return "Function not implemented"; + case UCS_ERR_MESSAGE_TRUNCATED: + return "Message truncated"; + case UCS_ERR_NO_PROGRESS: + return "No progress"; + case UCS_ERR_BUFFER_TOO_SMALL: + return "Provided buffer is too small"; + case UCS_ERR_NO_ELEM: + return "No such element"; + case UCS_ERR_SOME_CONNECTS_FAILED: + return "Failed to connect some of the requested endpoints"; + case UCS_ERR_NO_DEVICE: + return "No such device"; + case UCS_ERR_BUSY: + return "Device is busy"; + case UCS_ERR_CANCELED: + return "Request canceled"; + case UCS_ERR_SHMEM_SEGMENT: + return "Shared memory error"; + case UCS_ERR_ALREADY_EXISTS: + return "Element already exists"; + case UCS_ERR_OUT_OF_RANGE: + return "Index out of range"; + case UCS_ERR_TIMED_OUT: + return "Operation timed out"; + case UCS_ERR_EXCEEDS_LIMIT: + return "User-defined limit was reached"; + case UCS_ERR_UNSUPPORTED: + return "Unsupported operation"; + case UCS_ERR_REJECTED: + return "Operation rejected by remote peer"; + case UCS_ERR_NOT_CONNECTED: + return "Endpoint is not connected"; + case UCS_ERR_CONNECTION_RESET: + return "Connection reset by remote peer"; + case UCS_ERR_ENDPOINT_TIMEOUT: + return "Endpoint timeout"; + default: + return "Unknown error"; + }; +} + #endif diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index fa0d3e0f656..e6e53e793a1 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -55,6 +55,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_single( comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -96,6 +98,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add( flags, comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -161,6 +165,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi( flags, comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -233,6 +239,9 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial( counter_inc_value, counter_remote_address, flags, comp); } + + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -256,6 +265,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_progress(uct_device_ep_h device_ep) return UCS_OK; } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh index b8826e7c416..a68ae9a400a 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.cuh +++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh @@ -336,6 +336,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( if ((level != UCS_DEVICE_LEVEL_THREAD) && (level != UCS_DEVICE_LEVEL_WARP)) { + ucs_device_error("unsupported level: %s", ucs_device_level_name(level)); return UCS_ERR_UNSUPPORTED; } @@ -426,6 +427,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( if ((level != UCS_DEVICE_LEVEL_THREAD) && (level != UCS_DEVICE_LEVEL_WARP)) { + ucs_device_error("unsupported level: %s", ucs_device_level_name(level)); return UCS_ERR_UNSUPPORTED; } @@ -600,6 +602,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep) } else if (level == UCS_DEVICE_LEVEL_THREAD) { return uct_rc_mlx5_gda_progress_thread(ep); } else { + ucs_device_error("unsupported level: %s", ucs_device_level_name(level)); return UCS_ERR_UNSUPPORTED; } } From 522843538bbd3e006173aac57ad064b5fe95938b Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Mon, 6 Oct 2025 12:11:30 +0300 Subject: [PATCH 2/8] UCS/STATUS: Add UCS_STATUS_STRING_CASES macro --- src/ucs/sys/device_code.h | 57 +--------------------------------- src/ucs/type/status.c | 57 +--------------------------------- src/ucs/type/status.h | 64 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 112 deletions(-) diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index 48fc98cc5ac..d76329d2390 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -116,62 +116,7 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status) { switch (status) { - case UCS_OK: - return "Success"; - case UCS_INPROGRESS: - return "Operation in progress"; - case UCS_ERR_NO_MESSAGE: - return "No pending message"; - case UCS_ERR_NO_RESOURCE: - return "No resources are available to initiate the operation"; - case UCS_ERR_IO_ERROR: - return "Input/output error"; - case UCS_ERR_NO_MEMORY: - return "Out of memory"; - case UCS_ERR_INVALID_PARAM: - return "Invalid parameter"; - case UCS_ERR_UNREACHABLE: - return "Destination is unreachable"; - case UCS_ERR_INVALID_ADDR: - return "Address not valid"; - case UCS_ERR_NOT_IMPLEMENTED: - return "Function not implemented"; - case UCS_ERR_MESSAGE_TRUNCATED: - return "Message truncated"; - case UCS_ERR_NO_PROGRESS: - return "No progress"; - case UCS_ERR_BUFFER_TOO_SMALL: - return "Provided buffer is too small"; - case UCS_ERR_NO_ELEM: - return "No such element"; - case UCS_ERR_SOME_CONNECTS_FAILED: - return "Failed to connect some of the requested endpoints"; - case UCS_ERR_NO_DEVICE: - return "No such device"; - case UCS_ERR_BUSY: - return "Device is busy"; - case UCS_ERR_CANCELED: - return "Request canceled"; - case UCS_ERR_SHMEM_SEGMENT: - return "Shared memory error"; - case UCS_ERR_ALREADY_EXISTS: - return "Element already exists"; - case UCS_ERR_OUT_OF_RANGE: - return "Index out of range"; - case UCS_ERR_TIMED_OUT: - return "Operation timed out"; - case UCS_ERR_EXCEEDS_LIMIT: - return "User-defined limit was reached"; - case UCS_ERR_UNSUPPORTED: - return "Unsupported operation"; - case UCS_ERR_REJECTED: - return "Operation rejected by remote peer"; - case UCS_ERR_NOT_CONNECTED: - return "Endpoint is not connected"; - case UCS_ERR_CONNECTION_RESET: - return "Connection reset by remote peer"; - case UCS_ERR_ENDPOINT_TIMEOUT: - return "Endpoint timeout"; + UCS_STATUS_STRING_CASES default: return "Unknown error"; }; diff --git a/src/ucs/type/status.c b/src/ucs/type/status.c index 1e3547b98ee..58f426d9727 100644 --- a/src/ucs/type/status.c +++ b/src/ucs/type/status.c @@ -18,62 +18,7 @@ const char *ucs_status_string(ucs_status_t status) static char error_str[128] = {0}; switch (status) { - case UCS_OK: - return "Success"; - case UCS_INPROGRESS: - return "Operation in progress"; - case UCS_ERR_NO_MESSAGE: - return "No pending message"; - case UCS_ERR_NO_RESOURCE: - return "No resources are available to initiate the operation"; - case UCS_ERR_IO_ERROR: - return "Input/output error"; - case UCS_ERR_NO_MEMORY: - return "Out of memory"; - case UCS_ERR_INVALID_PARAM: - return "Invalid parameter"; - case UCS_ERR_UNREACHABLE: - return "Destination is unreachable"; - case UCS_ERR_INVALID_ADDR: - return "Address not valid"; - case UCS_ERR_NOT_IMPLEMENTED: - return "Function not implemented"; - case UCS_ERR_MESSAGE_TRUNCATED: - return "Message truncated"; - case UCS_ERR_NO_PROGRESS: - return "No progress"; - case UCS_ERR_BUFFER_TOO_SMALL: - return "Provided buffer is too small"; - case UCS_ERR_NO_ELEM: - return "No such element"; - case UCS_ERR_SOME_CONNECTS_FAILED: - return "Failed to connect some of the requested endpoints"; - case UCS_ERR_NO_DEVICE: - return "No such device"; - case UCS_ERR_BUSY: - return "Device is busy"; - case UCS_ERR_CANCELED: - return "Request canceled"; - case UCS_ERR_SHMEM_SEGMENT: - return "Shared memory error"; - case UCS_ERR_ALREADY_EXISTS: - return "Element already exists"; - case UCS_ERR_OUT_OF_RANGE: - return "Index out of range"; - case UCS_ERR_TIMED_OUT: - return "Operation timed out"; - case UCS_ERR_EXCEEDS_LIMIT: - return "User-defined limit was reached"; - case UCS_ERR_UNSUPPORTED: - return "Unsupported operation"; - case UCS_ERR_REJECTED: - return "Operation rejected by remote peer"; - case UCS_ERR_NOT_CONNECTED: - return "Endpoint is not connected"; - case UCS_ERR_CONNECTION_RESET: - return "Connection reset by remote peer"; - case UCS_ERR_ENDPOINT_TIMEOUT: - return "Endpoint timeout"; + UCS_STATUS_STRING_CASES default: snprintf(error_str, sizeof(error_str) - 1, "Unknown error %d", status); return error_str; diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index 4c40cda905a..e105ceb85e4 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -112,6 +112,70 @@ typedef void *ucs_status_ptr_t; #define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) #define UCS_STATUS_IS_ERR(_status) ((_status) < 0) +/** + * @brief Common status code to string cases + * + * This macro defines the common switch cases for converting status codes to + * strings. It's used by both the host and device implementations to avoid + * code duplication. + */ +#define UCS_STATUS_STRING_CASES \ + case UCS_OK: \ + return "Success"; \ + case UCS_INPROGRESS: \ + return "Operation in progress"; \ + case UCS_ERR_NO_MESSAGE: \ + return "No pending message"; \ + case UCS_ERR_NO_RESOURCE: \ + return "No resources are available to initiate the operation"; \ + case UCS_ERR_IO_ERROR: \ + return "Input/output error"; \ + case UCS_ERR_NO_MEMORY: \ + return "Out of memory"; \ + case UCS_ERR_INVALID_PARAM: \ + return "Invalid parameter"; \ + case UCS_ERR_UNREACHABLE: \ + return "Destination is unreachable"; \ + case UCS_ERR_INVALID_ADDR: \ + return "Address not valid"; \ + case UCS_ERR_NOT_IMPLEMENTED: \ + return "Function not implemented"; \ + case UCS_ERR_MESSAGE_TRUNCATED: \ + return "Message truncated"; \ + case UCS_ERR_NO_PROGRESS: \ + return "No progress"; \ + case UCS_ERR_BUFFER_TOO_SMALL: \ + return "Provided buffer is too small"; \ + case UCS_ERR_NO_ELEM: \ + return "No such element"; \ + case UCS_ERR_SOME_CONNECTS_FAILED: \ + return "Failed to connect some of the requested endpoints"; \ + case UCS_ERR_NO_DEVICE: \ + return "No such device"; \ + case UCS_ERR_BUSY: \ + return "Device is busy"; \ + case UCS_ERR_CANCELED: \ + return "Request canceled"; \ + case UCS_ERR_SHMEM_SEGMENT: \ + return "Shared memory error"; \ + case UCS_ERR_ALREADY_EXISTS: \ + return "Element already exists"; \ + case UCS_ERR_OUT_OF_RANGE: \ + return "Index out of range"; \ + case UCS_ERR_TIMED_OUT: \ + return "Operation timed out"; \ + case UCS_ERR_EXCEEDS_LIMIT: \ + return "User-defined limit was reached"; \ + case UCS_ERR_UNSUPPORTED: \ + return "Unsupported operation"; \ + case UCS_ERR_REJECTED: \ + return "Operation rejected by remote peer"; \ + case UCS_ERR_NOT_CONNECTED: \ + return "Endpoint is not connected"; \ + case UCS_ERR_CONNECTION_RESET: \ + return "Connection reset by remote peer"; \ + case UCS_ERR_ENDPOINT_TIMEOUT: \ + return "Endpoint timeout"; /** * @param status UCS status code. From e00e62a990a3039dd0df6bb6d989722a70782496 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Wed, 8 Oct 2025 20:50:28 +0300 Subject: [PATCH 3/8] DEVICE: Remove log of wireup until decided --- src/ucp/core/ucp_device.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index b777aaa103d..af1afc6e8d0 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -380,7 +380,6 @@ ucp_device_mem_list_create(ucp_ep_h ep, uct_allocated_memory_t mem; if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { - ucs_error("ep=%p didn't complete wireup", ep); return UCS_ERR_NOT_CONNECTED; } From 66e07321ecfa87e82b3f60ab04b2c147753a5328 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Thu, 9 Oct 2025 13:03:24 +0300 Subject: [PATCH 4/8] DEVICE: Use X-macro --- src/ucs/type/status.h | 150 +++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 97 deletions(-) diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index e105ceb85e4..b2403df2329 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -31,6 +31,50 @@ BEGIN_C_DECLS * @} */ +/** + * @ingroup UCS_RESOURCE + * @brief X-macro for defining status codes and their string representations + * + * This macro allows defining status codes and their associated messages in one + * place, avoiding duplication between enum definitions and string conversions. + * + * Usage: UCS_FOREACH_STATUS(_macro) where _macro(ID, VALUE, MSG) is expanded for each status. + */ +#define UCS_FOREACH_STATUS(_macro) \ + _macro(UCS_OK, 0, "Success") \ + _macro(UCS_INPROGRESS, 1, "Operation in progress") \ + _macro(UCS_ERR_NO_MESSAGE, -1, "No pending message") \ + _macro(UCS_ERR_NO_RESOURCE, -2, "No resources are available to initiate the operation") \ + _macro(UCS_ERR_IO_ERROR, -3, "Input/output error") \ + _macro(UCS_ERR_NO_MEMORY, -4, "Out of memory") \ + _macro(UCS_ERR_INVALID_PARAM, -5, "Invalid parameter") \ + _macro(UCS_ERR_UNREACHABLE, -6, "Destination is unreachable") \ + _macro(UCS_ERR_INVALID_ADDR, -7, "Address not valid") \ + _macro(UCS_ERR_NOT_IMPLEMENTED, -8, "Function not implemented") \ + _macro(UCS_ERR_MESSAGE_TRUNCATED, -9, "Message truncated") \ + _macro(UCS_ERR_NO_PROGRESS, -10, "No progress") \ + _macro(UCS_ERR_BUFFER_TOO_SMALL, -11, "Provided buffer is too small") \ + _macro(UCS_ERR_NO_ELEM, -12, "No such element") \ + _macro(UCS_ERR_SOME_CONNECTS_FAILED,-13, "Failed to connect some of the requested endpoints") \ + _macro(UCS_ERR_NO_DEVICE, -14, "No such device") \ + _macro(UCS_ERR_BUSY, -15, "Device is busy") \ + _macro(UCS_ERR_CANCELED, -16, "Request canceled") \ + _macro(UCS_ERR_SHMEM_SEGMENT, -17, "Shared memory error") \ + _macro(UCS_ERR_ALREADY_EXISTS, -18, "Element already exists") \ + _macro(UCS_ERR_OUT_OF_RANGE, -19, "Index out of range") \ + _macro(UCS_ERR_TIMED_OUT, -20, "Operation timed out") \ + _macro(UCS_ERR_EXCEEDS_LIMIT, -21, "User-defined limit was reached") \ + _macro(UCS_ERR_UNSUPPORTED, -22, "Unsupported operation") \ + _macro(UCS_ERR_REJECTED, -23, "Operation rejected by remote peer") \ + _macro(UCS_ERR_NOT_CONNECTED, -24, "Endpoint is not connected") \ + _macro(UCS_ERR_CONNECTION_RESET, -25, "Connection reset by remote peer") \ + _macro(UCS_ERR_FIRST_LINK_FAILURE, -40, "First link failure") \ + _macro(UCS_ERR_LAST_LINK_FAILURE, -59, "Last link failure") \ + _macro(UCS_ERR_FIRST_ENDPOINT_FAILURE,-60, "First endpoint failure") \ + _macro(UCS_ERR_ENDPOINT_TIMEOUT, -80, "Endpoint timeout") \ + _macro(UCS_ERR_LAST_ENDPOINT_FAILURE,-89, "Last endpoint failure") \ + _macro(UCS_ERR_LAST, -100, "Last error code") + /** * @ingroup UCS_RESOURCE * @brief Status codes @@ -42,47 +86,10 @@ BEGIN_C_DECLS * For example, if a link fails it may be sufficient to destroy (and possibly * replace) it, in contrast to an endpoint-level error. */ +#define UCS_STATUS_ENUMIFY(ID, VALUE, _) ID = VALUE, + typedef enum { - /* Operation completed successfully */ - UCS_OK = 0, - - /* Operation is queued and still in progress */ - UCS_INPROGRESS = 1, - - /* Failure codes */ - UCS_ERR_NO_MESSAGE = -1, - UCS_ERR_NO_RESOURCE = -2, - UCS_ERR_IO_ERROR = -3, - UCS_ERR_NO_MEMORY = -4, - UCS_ERR_INVALID_PARAM = -5, - UCS_ERR_UNREACHABLE = -6, - UCS_ERR_INVALID_ADDR = -7, - UCS_ERR_NOT_IMPLEMENTED = -8, - UCS_ERR_MESSAGE_TRUNCATED = -9, - UCS_ERR_NO_PROGRESS = -10, - UCS_ERR_BUFFER_TOO_SMALL = -11, - UCS_ERR_NO_ELEM = -12, - UCS_ERR_SOME_CONNECTS_FAILED = -13, - UCS_ERR_NO_DEVICE = -14, - UCS_ERR_BUSY = -15, - UCS_ERR_CANCELED = -16, - UCS_ERR_SHMEM_SEGMENT = -17, - UCS_ERR_ALREADY_EXISTS = -18, - UCS_ERR_OUT_OF_RANGE = -19, - UCS_ERR_TIMED_OUT = -20, - UCS_ERR_EXCEEDS_LIMIT = -21, - UCS_ERR_UNSUPPORTED = -22, - UCS_ERR_REJECTED = -23, - UCS_ERR_NOT_CONNECTED = -24, - UCS_ERR_CONNECTION_RESET = -25, - - UCS_ERR_FIRST_LINK_FAILURE = -40, - UCS_ERR_LAST_LINK_FAILURE = -59, - UCS_ERR_FIRST_ENDPOINT_FAILURE = -60, - UCS_ERR_ENDPOINT_TIMEOUT = -80, - UCS_ERR_LAST_ENDPOINT_FAILURE = -89, - - UCS_ERR_LAST = -100 + UCS_FOREACH_STATUS(UCS_STATUS_ENUMIFY) } UCS_S_PACKED ucs_status_t; @@ -112,6 +119,11 @@ typedef void *ucs_status_ptr_t; #define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) #define UCS_STATUS_IS_ERR(_status) ((_status) < 0) +/** + * @brief Helper macro to generate switch case for status to string conversion + */ +#define UCS_STATUS_STRINGIFY(ID, _, MSG) case ID: return MSG; + /** * @brief Common status code to string cases * @@ -119,63 +131,7 @@ typedef void *ucs_status_ptr_t; * strings. It's used by both the host and device implementations to avoid * code duplication. */ -#define UCS_STATUS_STRING_CASES \ - case UCS_OK: \ - return "Success"; \ - case UCS_INPROGRESS: \ - return "Operation in progress"; \ - case UCS_ERR_NO_MESSAGE: \ - return "No pending message"; \ - case UCS_ERR_NO_RESOURCE: \ - return "No resources are available to initiate the operation"; \ - case UCS_ERR_IO_ERROR: \ - return "Input/output error"; \ - case UCS_ERR_NO_MEMORY: \ - return "Out of memory"; \ - case UCS_ERR_INVALID_PARAM: \ - return "Invalid parameter"; \ - case UCS_ERR_UNREACHABLE: \ - return "Destination is unreachable"; \ - case UCS_ERR_INVALID_ADDR: \ - return "Address not valid"; \ - case UCS_ERR_NOT_IMPLEMENTED: \ - return "Function not implemented"; \ - case UCS_ERR_MESSAGE_TRUNCATED: \ - return "Message truncated"; \ - case UCS_ERR_NO_PROGRESS: \ - return "No progress"; \ - case UCS_ERR_BUFFER_TOO_SMALL: \ - return "Provided buffer is too small"; \ - case UCS_ERR_NO_ELEM: \ - return "No such element"; \ - case UCS_ERR_SOME_CONNECTS_FAILED: \ - return "Failed to connect some of the requested endpoints"; \ - case UCS_ERR_NO_DEVICE: \ - return "No such device"; \ - case UCS_ERR_BUSY: \ - return "Device is busy"; \ - case UCS_ERR_CANCELED: \ - return "Request canceled"; \ - case UCS_ERR_SHMEM_SEGMENT: \ - return "Shared memory error"; \ - case UCS_ERR_ALREADY_EXISTS: \ - return "Element already exists"; \ - case UCS_ERR_OUT_OF_RANGE: \ - return "Index out of range"; \ - case UCS_ERR_TIMED_OUT: \ - return "Operation timed out"; \ - case UCS_ERR_EXCEEDS_LIMIT: \ - return "User-defined limit was reached"; \ - case UCS_ERR_UNSUPPORTED: \ - return "Unsupported operation"; \ - case UCS_ERR_REJECTED: \ - return "Operation rejected by remote peer"; \ - case UCS_ERR_NOT_CONNECTED: \ - return "Endpoint is not connected"; \ - case UCS_ERR_CONNECTION_RESET: \ - return "Connection reset by remote peer"; \ - case UCS_ERR_ENDPOINT_TIMEOUT: \ - return "Endpoint timeout"; +#define UCS_STATUS_STRING_CASES UCS_FOREACH_STATUS(UCS_STATUS_STRINGIFY) /** * @param status UCS status code. From b2a039673223b3099595349687b2d1d066c9c3c6 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Thu, 9 Oct 2025 13:18:46 +0300 Subject: [PATCH 5/8] DEVICE: Re-add wireup log with scoped_log_handler in the test --- src/ucp/core/ucp_device.c | 1 + test/gtest/ucp/test_ucp_device.cc | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index af1afc6e8d0..b777aaa103d 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -380,6 +380,7 @@ ucp_device_mem_list_create(ucp_ep_h ep, uct_allocated_memory_t mem; if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { + ucs_error("ep=%p didn't complete wireup", ep); return UCS_ERR_NOT_CONNECTED; } diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 06d2fb8a420..993a062fbe1 100644 --- a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -78,9 +78,12 @@ void test_ucp_device::init() } ucp_device_mem_list_handle_h handle; - while (ucp_device_mem_list_create(sender().ep(), NULL, &handle) == - UCS_ERR_NOT_CONNECTED) { - progress(); + { + scoped_log_handler slh(wrap_errors_logger); + while (ucp_device_mem_list_create(sender().ep(), NULL, &handle) == + UCS_ERR_NOT_CONNECTED) { + progress(); + } } } From a1b7d9867b371b22764e3412be662ce094e94ea7 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Thu, 9 Oct 2025 13:45:48 +0300 Subject: [PATCH 6/8] DEVICE: Fix max widths in logs --- src/ucs/sys/device_code.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index d76329d2390..8d24de22b93 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -90,7 +90,7 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) /* Helper macro to print a message from a device function including the * thread and block indices, file, line, and function */ #define ucs_device_printf(_title, _fmt, ...) \ - printf("(%d:%d) %6s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ + printf("(%5d:%5d) %5s %-40.40s:%-4d %-30.30s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ __FILE__, __LINE__, __func__, ##__VA_ARGS__) /* Print an error message from a device function */ From e82e8674b04e6ee3d8b2a36051856a4de4b4640b Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Mon, 20 Oct 2025 17:50:20 +0300 Subject: [PATCH 7/8] DEVICE: PR fixes --- src/ucs/sys/device_code.h | 36 ++++++++++++++++----- src/ucs/sys/string.h | 20 ++++++++++-- src/ucs/type/status.h | 66 +++++++++++++++++++-------------------- 3 files changed, 78 insertions(+), 44 deletions(-) diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index 8d24de22b93..f8f5eae3a1c 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -8,6 +8,7 @@ #define UCS_DEVICE_CODE_H #include +#include #include #include @@ -87,11 +88,33 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) } +/** + * @brief Device compatible basename function + * + * Get pointer to file name in path, same as basename but do not modify source + * string. + * + * @param [in] path Path to parse + * + * @return File name + */ +UCS_F_DEVICE const char* ucs_device_basename(const char *path) +{ + return UCS_BASENAME(path); +} + + +/* Device log format - matches UCX host log structure */ +#define UCS_DEVICE_LOG_FMT "%20s[%-8d:%-7d] %17s:%-4u %-4s %-5s %*s" + + /* Helper macro to print a message from a device function including the - * thread and block indices, file, line, and function */ -#define ucs_device_printf(_title, _fmt, ...) \ - printf("(%5d:%5d) %5s %-40.40s:%-4d %-30.30s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ - __FILE__, __LINE__, __func__, ##__VA_ARGS__) + * thread and block indices, file and line */ +#define ucs_device_printf(_level, _fmt, ...) \ + printf(UCS_DEVICE_LOG_FMT _fmt "\n", \ + "", threadIdx.x, blockIdx.x, ucs_device_basename(__FILE__), __LINE__, \ + "UCX", _level, 0, "", ##__VA_ARGS__) + /* Print an error message from a device function */ #define ucs_device_error(_fmt, ...) \ @@ -106,12 +129,9 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) /** * @brief Device compatible status code to string conversion * - * This function provides status code to string conversion that can be called - * from device code. Returns a short string representation of the status code. - * * @param [in] status Status code to convert * - * @return Short string representation of the status code + * @return String representation of the status code */ UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status) { diff --git a/src/ucs/sys/string.h b/src/ucs/sys/string.h index a479f8ce73e..dec52bb0a13 100644 --- a/src/ucs/sys/string.h +++ b/src/ucs/sys/string.h @@ -40,6 +40,22 @@ BEGIN_C_DECLS #define UCS_VALUE_UNKNOWN_STR "unknown" +/* Macro for basename implementation logic used in both host and device code */ +#define UCS_BASENAME(_path) \ + ({ \ + const char *_p = (_path); \ + const char *_result = (_path); \ + while (*_p != '\0') { \ + if (*_p == '/') { \ + _result = _p + 1; \ + } \ + _p++; \ + } \ + _result; \ + }) + + + /** * Expand a partial path to full path. * @@ -210,9 +226,7 @@ char *ucs_strtrim(char *str); */ static UCS_F_ALWAYS_INLINE const char* ucs_basename(const char *path) { - const char *name = strrchr(path, '/'); - - return (name == NULL) ? path : name + 1; + return UCS_BASENAME(path); } diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index b2403df2329..a887cfc24ff 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -41,39 +41,39 @@ BEGIN_C_DECLS * Usage: UCS_FOREACH_STATUS(_macro) where _macro(ID, VALUE, MSG) is expanded for each status. */ #define UCS_FOREACH_STATUS(_macro) \ - _macro(UCS_OK, 0, "Success") \ - _macro(UCS_INPROGRESS, 1, "Operation in progress") \ - _macro(UCS_ERR_NO_MESSAGE, -1, "No pending message") \ - _macro(UCS_ERR_NO_RESOURCE, -2, "No resources are available to initiate the operation") \ - _macro(UCS_ERR_IO_ERROR, -3, "Input/output error") \ - _macro(UCS_ERR_NO_MEMORY, -4, "Out of memory") \ - _macro(UCS_ERR_INVALID_PARAM, -5, "Invalid parameter") \ - _macro(UCS_ERR_UNREACHABLE, -6, "Destination is unreachable") \ - _macro(UCS_ERR_INVALID_ADDR, -7, "Address not valid") \ - _macro(UCS_ERR_NOT_IMPLEMENTED, -8, "Function not implemented") \ - _macro(UCS_ERR_MESSAGE_TRUNCATED, -9, "Message truncated") \ - _macro(UCS_ERR_NO_PROGRESS, -10, "No progress") \ - _macro(UCS_ERR_BUFFER_TOO_SMALL, -11, "Provided buffer is too small") \ - _macro(UCS_ERR_NO_ELEM, -12, "No such element") \ - _macro(UCS_ERR_SOME_CONNECTS_FAILED,-13, "Failed to connect some of the requested endpoints") \ - _macro(UCS_ERR_NO_DEVICE, -14, "No such device") \ - _macro(UCS_ERR_BUSY, -15, "Device is busy") \ - _macro(UCS_ERR_CANCELED, -16, "Request canceled") \ - _macro(UCS_ERR_SHMEM_SEGMENT, -17, "Shared memory error") \ - _macro(UCS_ERR_ALREADY_EXISTS, -18, "Element already exists") \ - _macro(UCS_ERR_OUT_OF_RANGE, -19, "Index out of range") \ - _macro(UCS_ERR_TIMED_OUT, -20, "Operation timed out") \ - _macro(UCS_ERR_EXCEEDS_LIMIT, -21, "User-defined limit was reached") \ - _macro(UCS_ERR_UNSUPPORTED, -22, "Unsupported operation") \ - _macro(UCS_ERR_REJECTED, -23, "Operation rejected by remote peer") \ - _macro(UCS_ERR_NOT_CONNECTED, -24, "Endpoint is not connected") \ - _macro(UCS_ERR_CONNECTION_RESET, -25, "Connection reset by remote peer") \ - _macro(UCS_ERR_FIRST_LINK_FAILURE, -40, "First link failure") \ - _macro(UCS_ERR_LAST_LINK_FAILURE, -59, "Last link failure") \ - _macro(UCS_ERR_FIRST_ENDPOINT_FAILURE,-60, "First endpoint failure") \ - _macro(UCS_ERR_ENDPOINT_TIMEOUT, -80, "Endpoint timeout") \ - _macro(UCS_ERR_LAST_ENDPOINT_FAILURE,-89, "Last endpoint failure") \ - _macro(UCS_ERR_LAST, -100, "Last error code") + _macro(UCS_OK, 0, "Success") \ + _macro(UCS_INPROGRESS, 1, "Operation in progress") \ + _macro(UCS_ERR_NO_MESSAGE, -1, "No pending message") \ + _macro(UCS_ERR_NO_RESOURCE, -2, "No resources are available to initiate the operation") \ + _macro(UCS_ERR_IO_ERROR, -3, "Input/output error") \ + _macro(UCS_ERR_NO_MEMORY, -4, "Out of memory") \ + _macro(UCS_ERR_INVALID_PARAM, -5, "Invalid parameter") \ + _macro(UCS_ERR_UNREACHABLE, -6, "Destination is unreachable") \ + _macro(UCS_ERR_INVALID_ADDR, -7, "Address not valid") \ + _macro(UCS_ERR_NOT_IMPLEMENTED, -8, "Function not implemented") \ + _macro(UCS_ERR_MESSAGE_TRUNCATED, -9, "Message truncated") \ + _macro(UCS_ERR_NO_PROGRESS, -10, "No progress") \ + _macro(UCS_ERR_BUFFER_TOO_SMALL, -11, "Provided buffer is too small") \ + _macro(UCS_ERR_NO_ELEM, -12, "No such element") \ + _macro(UCS_ERR_SOME_CONNECTS_FAILED, -13, "Failed to connect some of the requested endpoints") \ + _macro(UCS_ERR_NO_DEVICE, -14, "No such device") \ + _macro(UCS_ERR_BUSY, -15, "Device is busy") \ + _macro(UCS_ERR_CANCELED, -16, "Request canceled") \ + _macro(UCS_ERR_SHMEM_SEGMENT, -17, "Shared memory error") \ + _macro(UCS_ERR_ALREADY_EXISTS, -18, "Element already exists") \ + _macro(UCS_ERR_OUT_OF_RANGE, -19, "Index out of range") \ + _macro(UCS_ERR_TIMED_OUT, -20, "Operation timed out") \ + _macro(UCS_ERR_EXCEEDS_LIMIT, -21, "User-defined limit was reached") \ + _macro(UCS_ERR_UNSUPPORTED, -22, "Unsupported operation") \ + _macro(UCS_ERR_REJECTED, -23, "Operation rejected by remote peer") \ + _macro(UCS_ERR_NOT_CONNECTED, -24, "Endpoint is not connected") \ + _macro(UCS_ERR_CONNECTION_RESET, -25, "Connection reset by remote peer") \ + _macro(UCS_ERR_FIRST_LINK_FAILURE, -40, "First link failure") \ + _macro(UCS_ERR_LAST_LINK_FAILURE, -59, "Last link failure") \ + _macro(UCS_ERR_FIRST_ENDPOINT_FAILURE, -60, "First endpoint failure") \ + _macro(UCS_ERR_ENDPOINT_TIMEOUT, -80, "Endpoint timeout") \ + _macro(UCS_ERR_LAST_ENDPOINT_FAILURE, -89, "Last endpoint failure") \ + _macro(UCS_ERR_LAST, -100, "Last error code") /** * @ingroup UCS_RESOURCE From fbc6bb7d901633495fea7669b388bf038cd3fcb9 Mon Sep 17 00:00:00 2001 From: Michal Shalev Date: Tue, 21 Oct 2025 02:56:07 +0300 Subject: [PATCH 8/8] DEVICE: Fix code style --- src/ucs/sys/device_code.h | 15 +++++++-------- src/ucs/sys/string.h | 2 +- src/ucs/type/status.c | 2 +- src/ucs/type/status.h | 5 +++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index f8f5eae3a1c..c16fba37579 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -37,8 +37,7 @@ typedef enum { } ucs_device_level_t; -UCS_F_DEVICE const char* -ucs_device_level_name(ucs_device_level_t level) +UCS_F_DEVICE const char *ucs_device_level_name(ucs_device_level_t level) { switch (level) { case UCS_DEVICE_LEVEL_THREAD: @@ -98,7 +97,7 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) * * @return File name */ -UCS_F_DEVICE const char* ucs_device_basename(const char *path) +UCS_F_DEVICE const char *ucs_device_basename(const char *path) { return UCS_BASENAME(path); } @@ -111,9 +110,9 @@ UCS_F_DEVICE const char* ucs_device_basename(const char *path) /* Helper macro to print a message from a device function including the * thread and block indices, file and line */ #define ucs_device_printf(_level, _fmt, ...) \ - printf(UCS_DEVICE_LOG_FMT _fmt "\n", \ - "", threadIdx.x, blockIdx.x, ucs_device_basename(__FILE__), __LINE__, \ - "UCX", _level, 0, "", ##__VA_ARGS__) + printf(UCS_DEVICE_LOG_FMT _fmt "\n", "", threadIdx.x, blockIdx.x, \ + ucs_device_basename(__FILE__), __LINE__, "UCX", _level, 0, "", \ + ##__VA_ARGS__) /* Print an error message from a device function */ @@ -133,10 +132,10 @@ UCS_F_DEVICE const char* ucs_device_basename(const char *path) * * @return String representation of the status code */ -UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status) +UCS_F_DEVICE const char *ucs_device_status_string(ucs_status_t status) { switch (status) { - UCS_STATUS_STRING_CASES + UCS_STATUS_STRING_CASES default: return "Unknown error"; }; diff --git a/src/ucs/sys/string.h b/src/ucs/sys/string.h index dec52bb0a13..6fb00258a31 100644 --- a/src/ucs/sys/string.h +++ b/src/ucs/sys/string.h @@ -43,7 +43,7 @@ BEGIN_C_DECLS /* Macro for basename implementation logic used in both host and device code */ #define UCS_BASENAME(_path) \ ({ \ - const char *_p = (_path); \ + const char *_p = (_path); \ const char *_result = (_path); \ while (*_p != '\0') { \ if (*_p == '/') { \ diff --git a/src/ucs/type/status.c b/src/ucs/type/status.c index 58f426d9727..3507a1d7ade 100644 --- a/src/ucs/type/status.c +++ b/src/ucs/type/status.c @@ -18,7 +18,7 @@ const char *ucs_status_string(ucs_status_t status) static char error_str[128] = {0}; switch (status) { - UCS_STATUS_STRING_CASES + UCS_STATUS_STRING_CASES default: snprintf(error_str, sizeof(error_str) - 1, "Unknown error %d", status); return error_str; diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index a887cfc24ff..b1be0dca6ef 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -122,8 +122,9 @@ typedef void *ucs_status_ptr_t; /** * @brief Helper macro to generate switch case for status to string conversion */ -#define UCS_STATUS_STRINGIFY(ID, _, MSG) case ID: return MSG; - +#define UCS_STATUS_STRINGIFY(ID, _, MSG) \ + case ID: \ + return MSG; /** * @brief Common status code to string cases *