Skip to content

Commit 341b9c3

Browse files
committed
DEVICE: Add logs
1 parent d2ee3c4 commit 341b9c3

File tree

5 files changed

+110
-4
lines changed

5 files changed

+110
-4
lines changed

src/ucp/api/device/ucp_device_impl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_send(
8888

8989
if ((mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) ||
9090
(first_mem_elem_index >= mem_list_h->mem_list_length)) {
91+
ucs_device_error("invalid parameters: mem_list version=%u (expected %u), "
92+
"first_mem_elem_index=%u, mem_list_length=%u",
93+
mem_list_h->version, UCP_DEVICE_MEM_LIST_VERSION_V1,
94+
first_mem_elem_index, mem_list_h->mem_list_length);
9195
return UCS_ERR_INVALID_PARAM;
9296
}
9397

@@ -144,6 +148,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single(
144148
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
145149
uct_elem, comp);
146150
if (status != UCS_OK) {
151+
ucs_device_error("send prepare failed with %s, mem_list_index=%u",
152+
ucs_device_status_string(status), mem_list_index);
147153
return status;
148154
}
149155

@@ -195,6 +201,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
195201
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
196202
uct_elem, comp);
197203
if (status != UCS_OK) {
204+
ucs_device_error("send prepare failed with %s, mem_list_index=%u",
205+
ucs_device_status_string(status), mem_list_index);
198206
return status;
199207
}
200208

@@ -259,6 +267,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
259267
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
260268
uct_mem_list, comp);
261269
if (status != UCS_OK) {
270+
ucs_device_error("send prepare failed with %s, mem_list_length=%u",
271+
ucs_device_status_string(status), mem_list_h->mem_list_length);
262272
return status;
263273
}
264274

@@ -334,6 +344,8 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial(
334344
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
335345
uct_mem_list, comp);
336346
if (status != UCS_OK) {
347+
ucs_device_error("send prepare failed with %s, mem_list_count=%u",
348+
ucs_device_status_string(status), mem_list_count);
337349
return status;
338350
}
339351

src/ucp/core/ucp_device.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
380380
uct_allocated_memory_t mem;
381381

382382
if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) {
383+
ucs_error("ep=%p didn't complete wireup", ep);
383384
return UCS_ERR_NOT_CONNECTED;
384385
}
385386

@@ -388,6 +389,8 @@ ucp_device_mem_list_create(ucp_ep_h ep,
388389
&local_sys_dev, &local_md_map,
389390
&mem_type);
390391
if (status != UCS_OK) {
392+
ucs_error("ep=%p check parameters failed: %s", ep,
393+
ucs_status_string(status));
391394
return status;
392395
}
393396

@@ -426,6 +429,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
426429
/* Track memory allocator for later release */
427430
status = ucp_device_mem_handle_hash_insert(&mem);
428431
if (status != UCS_OK) {
432+
ucs_error("failed to insert handle: %s", ucs_status_string(status));
429433
uct_mem_free(&mem);
430434
} else {
431435
*handle_p = mem.address;

src/ucs/sys/device_code.h

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#define UCS_DEVICE_CODE_H
99

1010
#include <ucs/sys/compiler_def.h>
11+
#include <ucs/type/status.h>
1112
#include <stdint.h>
1213

1314
/*
@@ -35,7 +36,7 @@ typedef enum {
3536
} ucs_device_level_t;
3637

3738

38-
static UCS_F_ALWAYS_INLINE const char*
39+
UCS_F_DEVICE const char*
3940
ucs_device_level_name(ucs_device_level_t level)
4041
{
4142
switch (level) {
@@ -87,10 +88,10 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
8788

8889

8990
/* Helper macro to print a message from a device function including the
90-
* thread and block indices */
91+
* thread and block indices, file, line, and function */
9192
#define ucs_device_printf(_title, _fmt, ...) \
92-
printf("(%d:%d) %6s " _fmt "\n", threadIdx.x, blockIdx.x, _title, \
93-
##__VA_ARGS__)
93+
printf("(%d:%d) %6s %s:%d %s: " _fmt "\n", threadIdx.x, blockIdx.x, _title, \
94+
__FILE__, __LINE__, __func__, ##__VA_ARGS__)
9495

9596
/* Print an error message from a device function */
9697
#define ucs_device_error(_fmt, ...) \
@@ -101,4 +102,79 @@ UCS_F_DEVICE void ucs_device_atomic64_write(uint64_t *ptr, uint64_t value)
101102
#define ucs_device_debug(_fmt, ...) \
102103
ucs_device_printf("DEBUG", _fmt, ##__VA_ARGS__)
103104

105+
106+
/**
107+
* @brief Device compatible status code to string conversion
108+
*
109+
* This function provides status code to string conversion that can be called
110+
* from device code. Returns a short string representation of the status code.
111+
*
112+
* @param [in] status Status code to convert
113+
*
114+
* @return Short string representation of the status code
115+
*/
116+
UCS_F_DEVICE const char* ucs_device_status_string(ucs_status_t status)
117+
{
118+
switch (status) {
119+
case UCS_OK:
120+
return "Success";
121+
case UCS_INPROGRESS:
122+
return "Operation in progress";
123+
case UCS_ERR_NO_MESSAGE:
124+
return "No pending message";
125+
case UCS_ERR_NO_RESOURCE:
126+
return "No resources are available to initiate the operation";
127+
case UCS_ERR_IO_ERROR:
128+
return "Input/output error";
129+
case UCS_ERR_NO_MEMORY:
130+
return "Out of memory";
131+
case UCS_ERR_INVALID_PARAM:
132+
return "Invalid parameter";
133+
case UCS_ERR_UNREACHABLE:
134+
return "Destination is unreachable";
135+
case UCS_ERR_INVALID_ADDR:
136+
return "Address not valid";
137+
case UCS_ERR_NOT_IMPLEMENTED:
138+
return "Function not implemented";
139+
case UCS_ERR_MESSAGE_TRUNCATED:
140+
return "Message truncated";
141+
case UCS_ERR_NO_PROGRESS:
142+
return "No progress";
143+
case UCS_ERR_BUFFER_TOO_SMALL:
144+
return "Provided buffer is too small";
145+
case UCS_ERR_NO_ELEM:
146+
return "No such element";
147+
case UCS_ERR_SOME_CONNECTS_FAILED:
148+
return "Failed to connect some of the requested endpoints";
149+
case UCS_ERR_NO_DEVICE:
150+
return "No such device";
151+
case UCS_ERR_BUSY:
152+
return "Device is busy";
153+
case UCS_ERR_CANCELED:
154+
return "Request canceled";
155+
case UCS_ERR_SHMEM_SEGMENT:
156+
return "Shared memory error";
157+
case UCS_ERR_ALREADY_EXISTS:
158+
return "Element already exists";
159+
case UCS_ERR_OUT_OF_RANGE:
160+
return "Index out of range";
161+
case UCS_ERR_TIMED_OUT:
162+
return "Operation timed out";
163+
case UCS_ERR_EXCEEDS_LIMIT:
164+
return "User-defined limit was reached";
165+
case UCS_ERR_UNSUPPORTED:
166+
return "Unsupported operation";
167+
case UCS_ERR_REJECTED:
168+
return "Operation rejected by remote peer";
169+
case UCS_ERR_NOT_CONNECTED:
170+
return "Endpoint is not connected";
171+
case UCS_ERR_CONNECTION_RESET:
172+
return "Connection reset by remote peer";
173+
case UCS_ERR_ENDPOINT_TIMEOUT:
174+
return "Endpoint timeout";
175+
default:
176+
return "Unknown error";
177+
};
178+
}
179+
104180
#endif

src/uct/api/device/uct_device_impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_single(
5555
comp);
5656
}
5757

58+
ucs_device_error("unsupported device_ep->uct_tl_id=%d",
59+
device_ep->uct_tl_id);
5860
return UCS_ERR_UNSUPPORTED;
5961
}
6062

@@ -96,6 +98,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_atomic_add(
9698
flags, comp);
9799
}
98100

101+
ucs_device_error("unsupported device_ep->uct_tl_id=%d",
102+
device_ep->uct_tl_id);
99103
return UCS_ERR_UNSUPPORTED;
100104
}
101105

@@ -161,6 +165,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi(
161165
flags, comp);
162166
}
163167

168+
ucs_device_error("unsupported device_ep->uct_tl_id=%d",
169+
device_ep->uct_tl_id);
164170
return UCS_ERR_UNSUPPORTED;
165171
}
166172

@@ -233,6 +239,9 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_multi_partial(
233239
counter_inc_value, counter_remote_address,
234240
flags, comp);
235241
}
242+
243+
ucs_device_error("unsupported device_ep->uct_tl_id=%d",
244+
device_ep->uct_tl_id);
236245
return UCS_ERR_UNSUPPORTED;
237246
}
238247

@@ -256,6 +265,8 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_progress(uct_device_ep_h device_ep)
256265
return UCS_OK;
257266
}
258267

268+
ucs_device_error("unsupported device_ep->uct_tl_id=%d",
269+
device_ep->uct_tl_id);
259270
return UCS_ERR_UNSUPPORTED;
260271
}
261272

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
334334

335335
if ((level != UCS_DEVICE_LEVEL_THREAD) &&
336336
(level != UCS_DEVICE_LEVEL_WARP)) {
337+
ucs_device_error("unsupported level: %s", ucs_device_level_name(level));
337338
return UCS_ERR_UNSUPPORTED;
338339
}
339340

@@ -424,6 +425,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
424425

425426
if ((level != UCS_DEVICE_LEVEL_THREAD) &&
426427
(level != UCS_DEVICE_LEVEL_WARP)) {
428+
ucs_device_error("unsupported level: %s", ucs_device_level_name(level));
427429
return UCS_ERR_UNSUPPORTED;
428430
}
429431

@@ -598,6 +600,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep)
598600
} else if (level == UCS_DEVICE_LEVEL_THREAD) {
599601
return uct_rc_mlx5_gda_progress_thread(ep);
600602
} else {
603+
ucs_device_error("unsupported level: %s", ucs_device_level_name(level));
601604
return UCS_ERR_UNSUPPORTED;
602605
}
603606
}

0 commit comments

Comments
 (0)