diff --git a/src/ucp/api/device/ucp_device_impl.h b/src/ucp/api/device/ucp_device_impl.h index a9e1ac694d1..d7c64c9f8f4 100644 --- a/src/ucp/api/device/ucp_device_impl.h +++ b/src/ucp/api/device/ucp_device_impl.h @@ -91,6 +91,10 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send( if ((mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) || (first_mem_elem_index >= mem_list_h->mem_list_length)) { + ucs_device_error("invalid parameters: mem_list version=%u (expected %u), " + "first_mem_elem_index=%u, mem_list_length=%u", + mem_list_h->version, UCP_DEVICE_MEM_LIST_VERSION_V1, + first_mem_elem_index, mem_list_h->mem_list_length); return UCS_ERR_INVALID_PARAM; } @@ -157,6 +161,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single( status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep, uct_elem, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_index=%u", + ucs_device_status_string(status), mem_list_index); return status; } @@ -214,6 +220,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc( status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep, uct_elem, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_index=%u", + ucs_device_status_string(status), mem_list_index); return status; } @@ -276,6 +284,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi( status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep, uct_mem_list, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_length=%u", + ucs_device_status_string(status), mem_list_h->mem_list_length); return status; } @@ -362,6 +372,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial( status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep, uct_mem_list, comp); if (status != UCS_OK) { + ucs_device_error("send prepare failed with %s, mem_list_count=%u", + ucs_device_status_string(status), mem_list_count); return status; } diff --git a/src/ucp/core/ucp_device.c b/src/ucp/core/ucp_device.c index 919c547b138..8d32c2a4777 100644 --- a/src/ucp/core/ucp_device.c +++ b/src/ucp/core/ucp_device.c @@ -419,6 +419,7 @@ ucp_device_mem_list_create(ucp_ep_h ep, uct_allocated_memory_t mem; if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { + ucs_error("ep=%p didn't complete wireup", ep); return UCS_ERR_NOT_CONNECTED; } @@ -427,6 +428,8 @@ ucp_device_mem_list_create(ucp_ep_h ep, &local_sys_dev, &local_md_map, &mem_type); if (status != UCS_OK) { + ucs_error("ep=%p check parameters failed: %s", ep, + ucs_status_string(status)); return status; } @@ -464,6 +467,7 @@ ucp_device_mem_list_create(ucp_ep_h ep, /* Track memory allocator for later release */ status = ucp_device_mem_handle_hash_insert(&mem); if (status != UCS_OK) { + ucs_error("failed to insert handle: %s", ucs_status_string(status)); uct_mem_free(&mem); } else { *handle_p = mem.address; diff --git a/src/ucs/sys/device_code.h b/src/ucs/sys/device_code.h index d80835fe5c1..c16fba37579 100644 --- a/src/ucs/sys/device_code.h +++ b/src/ucs/sys/device_code.h @@ -8,6 +8,8 @@ #define UCS_DEVICE_CODE_H #include +#include +#include #include /* @@ -35,8 +37,7 @@ typedef enum { } ucs_device_level_t; -static UCS_F_ALWAYS_INLINE const char* -ucs_device_level_name(ucs_device_level_t level) +UCS_F_DEVICE const char *ucs_device_level_name(ucs_device_level_t level) { switch (level) { case UCS_DEVICE_LEVEL_THREAD: @@ -86,12 +87,34 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) } +/** + * @brief Device compatible basename function + * + * Get pointer to file name in path, same as basename but do not modify source + * string. + * + * @param [in] path Path to parse + * + * @return File name + */ +UCS_F_DEVICE const char *ucs_device_basename(const char *path) +{ + return UCS_BASENAME(path); +} + + +/* Device log format - matches UCX host log structure */ +#define UCS_DEVICE_LOG_FMT "%20s[%-8d:%-7d] %17s:%-4u %-4s %-5s %*s" + + /* Helper macro to print a message from a device function including the - * thread and block indices */ -#define ucs_device_printf(_title, _fmt, ...) \ - printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \ + * thread and block indices, file and line */ +#define ucs_device_printf(_level, _fmt, ...) \ + printf(UCS_DEVICE_LOG_FMT _fmt "\n", "", threadIdx.x, blockIdx.x, \ + ucs_device_basename(__FILE__), __LINE__, "UCX", _level, 0, "", \ ##__VA_ARGS__) + /* Print an error message from a device function */ #define ucs_device_error(_fmt, ...) \ ucs_device_printf("ERROR", _fmt, ##__VA_ARGS__) @@ -101,4 +124,21 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value) #define ucs_device_debug(_fmt, ...) \ ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__) + +/** + * @brief Device compatible status code to string conversion + * + * @param [in] status Status code to convert + * + * @return String representation of the status code + */ +UCS_F_DEVICE const char *ucs_device_status_string(ucs_status_t status) +{ + switch (status) { + UCS_STATUS_STRING_CASES + default: + return "Unknown error"; + }; +} + #endif diff --git a/src/ucs/sys/string.h b/src/ucs/sys/string.h index a479f8ce73e..6fb00258a31 100644 --- a/src/ucs/sys/string.h +++ b/src/ucs/sys/string.h @@ -40,6 +40,22 @@ BEGIN_C_DECLS #define UCS_VALUE_UNKNOWN_STR "unknown" +/* Macro for basename implementation logic used in both host and device code */ +#define UCS_BASENAME(_path) \ + ({ \ + const char *_p = (_path); \ + const char *_result = (_path); \ + while (*_p != '\0') { \ + if (*_p == '/') { \ + _result = _p + 1; \ + } \ + _p++; \ + } \ + _result; \ + }) + + + /** * Expand a partial path to full path. * @@ -210,9 +226,7 @@ char *ucs_strtrim(char *str); */ static UCS_F_ALWAYS_INLINE const char* ucs_basename(const char *path) { - const char *name = strrchr(path, '/'); - - return (name == NULL) ? path : name + 1; + return UCS_BASENAME(path); } diff --git a/src/ucs/type/status.c b/src/ucs/type/status.c index 1e3547b98ee..3507a1d7ade 100644 --- a/src/ucs/type/status.c +++ b/src/ucs/type/status.c @@ -18,62 +18,7 @@ const char *ucs_status_string(ucs_status_t status) static char error_str[128] = {0}; switch (status) { - case UCS_OK: - return "Success"; - case UCS_INPROGRESS: - return "Operation in progress"; - case UCS_ERR_NO_MESSAGE: - return "No pending message"; - case UCS_ERR_NO_RESOURCE: - return "No resources are available to initiate the operation"; - case UCS_ERR_IO_ERROR: - return "Input/output error"; - case UCS_ERR_NO_MEMORY: - return "Out of memory"; - case UCS_ERR_INVALID_PARAM: - return "Invalid parameter"; - case UCS_ERR_UNREACHABLE: - return "Destination is unreachable"; - case UCS_ERR_INVALID_ADDR: - return "Address not valid"; - case UCS_ERR_NOT_IMPLEMENTED: - return "Function not implemented"; - case UCS_ERR_MESSAGE_TRUNCATED: - return "Message truncated"; - case UCS_ERR_NO_PROGRESS: - return "No progress"; - case UCS_ERR_BUFFER_TOO_SMALL: - return "Provided buffer is too small"; - case UCS_ERR_NO_ELEM: - return "No such element"; - case UCS_ERR_SOME_CONNECTS_FAILED: - return "Failed to connect some of the requested endpoints"; - case UCS_ERR_NO_DEVICE: - return "No such device"; - case UCS_ERR_BUSY: - return "Device is busy"; - case UCS_ERR_CANCELED: - return "Request canceled"; - case UCS_ERR_SHMEM_SEGMENT: - return "Shared memory error"; - case UCS_ERR_ALREADY_EXISTS: - return "Element already exists"; - case UCS_ERR_OUT_OF_RANGE: - return "Index out of range"; - case UCS_ERR_TIMED_OUT: - return "Operation timed out"; - case UCS_ERR_EXCEEDS_LIMIT: - return "User-defined limit was reached"; - case UCS_ERR_UNSUPPORTED: - return "Unsupported operation"; - case UCS_ERR_REJECTED: - return "Operation rejected by remote peer"; - case UCS_ERR_NOT_CONNECTED: - return "Endpoint is not connected"; - case UCS_ERR_CONNECTION_RESET: - return "Connection reset by remote peer"; - case UCS_ERR_ENDPOINT_TIMEOUT: - return "Endpoint timeout"; + UCS_STATUS_STRING_CASES default: snprintf(error_str, sizeof(error_str) - 1, "Unknown error %d", status); return error_str; diff --git a/src/ucs/type/status.h b/src/ucs/type/status.h index 4c40cda905a..b1be0dca6ef 100644 --- a/src/ucs/type/status.h +++ b/src/ucs/type/status.h @@ -31,6 +31,50 @@ BEGIN_C_DECLS * @} */ +/** + * @ingroup UCS_RESOURCE + * @brief X-macro for defining status codes and their string representations + * + * This macro allows defining status codes and their associated messages in one + * place, avoiding duplication between enum definitions and string conversions. + * + * Usage: UCS_FOREACH_STATUS(_macro) where _macro(ID, VALUE, MSG) is expanded for each status. + */ +#define UCS_FOREACH_STATUS(_macro) \ + _macro(UCS_OK, 0, "Success") \ + _macro(UCS_INPROGRESS, 1, "Operation in progress") \ + _macro(UCS_ERR_NO_MESSAGE, -1, "No pending message") \ + _macro(UCS_ERR_NO_RESOURCE, -2, "No resources are available to initiate the operation") \ + _macro(UCS_ERR_IO_ERROR, -3, "Input/output error") \ + _macro(UCS_ERR_NO_MEMORY, -4, "Out of memory") \ + _macro(UCS_ERR_INVALID_PARAM, -5, "Invalid parameter") \ + _macro(UCS_ERR_UNREACHABLE, -6, "Destination is unreachable") \ + _macro(UCS_ERR_INVALID_ADDR, -7, "Address not valid") \ + _macro(UCS_ERR_NOT_IMPLEMENTED, -8, "Function not implemented") \ + _macro(UCS_ERR_MESSAGE_TRUNCATED, -9, "Message truncated") \ + _macro(UCS_ERR_NO_PROGRESS, -10, "No progress") \ + _macro(UCS_ERR_BUFFER_TOO_SMALL, -11, "Provided buffer is too small") \ + _macro(UCS_ERR_NO_ELEM, -12, "No such element") \ + _macro(UCS_ERR_SOME_CONNECTS_FAILED, -13, "Failed to connect some of the requested endpoints") \ + _macro(UCS_ERR_NO_DEVICE, -14, "No such device") \ + _macro(UCS_ERR_BUSY, -15, "Device is busy") \ + _macro(UCS_ERR_CANCELED, -16, "Request canceled") \ + _macro(UCS_ERR_SHMEM_SEGMENT, -17, "Shared memory error") \ + _macro(UCS_ERR_ALREADY_EXISTS, -18, "Element already exists") \ + _macro(UCS_ERR_OUT_OF_RANGE, -19, "Index out of range") \ + _macro(UCS_ERR_TIMED_OUT, -20, "Operation timed out") \ + _macro(UCS_ERR_EXCEEDS_LIMIT, -21, "User-defined limit was reached") \ + _macro(UCS_ERR_UNSUPPORTED, -22, "Unsupported operation") \ + _macro(UCS_ERR_REJECTED, -23, "Operation rejected by remote peer") \ + _macro(UCS_ERR_NOT_CONNECTED, -24, "Endpoint is not connected") \ + _macro(UCS_ERR_CONNECTION_RESET, -25, "Connection reset by remote peer") \ + _macro(UCS_ERR_FIRST_LINK_FAILURE, -40, "First link failure") \ + _macro(UCS_ERR_LAST_LINK_FAILURE, -59, "Last link failure") \ + _macro(UCS_ERR_FIRST_ENDPOINT_FAILURE, -60, "First endpoint failure") \ + _macro(UCS_ERR_ENDPOINT_TIMEOUT, -80, "Endpoint timeout") \ + _macro(UCS_ERR_LAST_ENDPOINT_FAILURE, -89, "Last endpoint failure") \ + _macro(UCS_ERR_LAST, -100, "Last error code") + /** * @ingroup UCS_RESOURCE * @brief Status codes @@ -42,47 +86,10 @@ BEGIN_C_DECLS * For example, if a link fails it may be sufficient to destroy (and possibly * replace) it, in contrast to an endpoint-level error. */ +#define UCS_STATUS_ENUMIFY(ID, VALUE, _) ID = VALUE, + typedef enum { - /* Operation completed successfully */ - UCS_OK = 0, - - /* Operation is queued and still in progress */ - UCS_INPROGRESS = 1, - - /* Failure codes */ - UCS_ERR_NO_MESSAGE = -1, - UCS_ERR_NO_RESOURCE = -2, - UCS_ERR_IO_ERROR = -3, - UCS_ERR_NO_MEMORY = -4, - UCS_ERR_INVALID_PARAM = -5, - UCS_ERR_UNREACHABLE = -6, - UCS_ERR_INVALID_ADDR = -7, - UCS_ERR_NOT_IMPLEMENTED = -8, - UCS_ERR_MESSAGE_TRUNCATED = -9, - UCS_ERR_NO_PROGRESS = -10, - UCS_ERR_BUFFER_TOO_SMALL = -11, - UCS_ERR_NO_ELEM = -12, - UCS_ERR_SOME_CONNECTS_FAILED = -13, - UCS_ERR_NO_DEVICE = -14, - UCS_ERR_BUSY = -15, - UCS_ERR_CANCELED = -16, - UCS_ERR_SHMEM_SEGMENT = -17, - UCS_ERR_ALREADY_EXISTS = -18, - UCS_ERR_OUT_OF_RANGE = -19, - UCS_ERR_TIMED_OUT = -20, - UCS_ERR_EXCEEDS_LIMIT = -21, - UCS_ERR_UNSUPPORTED = -22, - UCS_ERR_REJECTED = -23, - UCS_ERR_NOT_CONNECTED = -24, - UCS_ERR_CONNECTION_RESET = -25, - - UCS_ERR_FIRST_LINK_FAILURE = -40, - UCS_ERR_LAST_LINK_FAILURE = -59, - UCS_ERR_FIRST_ENDPOINT_FAILURE = -60, - UCS_ERR_ENDPOINT_TIMEOUT = -80, - UCS_ERR_LAST_ENDPOINT_FAILURE = -89, - - UCS_ERR_LAST = -100 + UCS_FOREACH_STATUS(UCS_STATUS_ENUMIFY) } UCS_S_PACKED ucs_status_t; @@ -112,6 +119,20 @@ typedef void *ucs_status_ptr_t; #define UCS_STATUS_PTR(_status) ((void*)(intptr_t)(_status)) #define UCS_STATUS_IS_ERR(_status) ((_status) < 0) +/** + * @brief Helper macro to generate switch case for status to string conversion + */ +#define UCS_STATUS_STRINGIFY(ID, _, MSG) \ + case ID: \ + return MSG; +/** + * @brief Common status code to string cases + * + * This macro defines the common switch cases for converting status codes to + * strings. It's used by both the host and device implementations to avoid + * code duplication. + */ +#define UCS_STATUS_STRING_CASES UCS_FOREACH_STATUS(UCS_STATUS_STRINGIFY) /** * @param status UCS status code. diff --git a/src/uct/api/device/uct_device_impl.h b/src/uct/api/device/uct_device_impl.h index 42b82ee5fc5..f54cc42e66d 100644 --- a/src/uct/api/device/uct_device_impl.h +++ b/src/uct/api/device/uct_device_impl.h @@ -65,6 +65,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_single( comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -110,6 +112,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add( remote_address, flags, comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -180,6 +184,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi( comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -260,6 +266,9 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial( lengths, counter_index, counter_inc_value, counter_remote_address, flags, comp); } + + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } @@ -299,6 +308,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_check_completion( return uct_rc_mlx5_gda_ep_check_completion(device_ep, comp); } + ucs_device_error("unsupported device_ep->uct_tl_id=%d", + device_ep->uct_tl_id); return UCS_ERR_UNSUPPORTED; } diff --git a/src/uct/ib/mlx5/gdaki/gdaki.cuh b/src/uct/ib/mlx5/gdaki/gdaki.cuh index b057bc3cf62..2025c7ea44c 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.cuh +++ b/src/uct/ib/mlx5/gdaki/gdaki.cuh @@ -344,6 +344,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi( if ((level != UCS_DEVICE_LEVEL_THREAD) && (level != UCS_DEVICE_LEVEL_WARP)) { + ucs_device_error("unsupported level: %s", ucs_device_level_name(level)); return UCS_ERR_UNSUPPORTED; } @@ -438,6 +439,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial( if ((level != UCS_DEVICE_LEVEL_THREAD) && (level != UCS_DEVICE_LEVEL_WARP)) { + ucs_device_error("unsupported level: %s", ucs_device_level_name(level)); return UCS_ERR_UNSUPPORTED; } diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 15906e020b8..6a58933ca33 100644 --- a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -78,9 +78,12 @@ void test_ucp_device::init() } ucp_device_mem_list_handle_h handle; - while (ucp_device_mem_list_create(sender().ep(), NULL, &handle) == - UCS_ERR_NOT_CONNECTED) { - progress(); + { + scoped_log_handler slh(wrap_errors_logger); + while (ucp_device_mem_list_create(sender().ep(), NULL, &handle) == + UCS_ERR_NOT_CONNECTED) { + progress(); + } } }