From 1c6e882a8c0a6294991e9c50c6c582fc77b1e033 Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Sun, 5 Oct 2025 11:49:52 +0000 Subject: [PATCH 1/7] UCT/CUDA_IPC: Enforce host memory support for mem_type EP --- src/ucp/core/ucp_ep.c | 11 ++++++++++- src/uct/cuda/cuda_ipc/cuda_ipc_iface.c | 7 +------ src/uct/cuda/cuda_ipc/cuda_ipc_iface.h | 1 - 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index 3562dfd1e6b..a6b50caf86e 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -684,12 +684,18 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker) ucs_status_t status; void *address_buffer; size_t address_length; - ucp_tl_bitmap_t mem_access_tls; + ucp_tl_bitmap_t mem_access_tls, host_mem_access_tls; char ep_name[UCP_WORKER_ADDRESS_NAME_MAX]; unsigned addr_indices[UCP_MAX_LANES]; + ucp_lane_index_t num_lanes; ucs_memory_type_for_each(mem_type) { ucp_context_memaccess_tl_bitmap(context, mem_type, 0, &mem_access_tls); + /* Mem type EP requires host memory support */ + ucp_context_memaccess_tl_bitmap(context, UCS_MEMORY_TYPE_HOST, 0, + &host_mem_access_tls); + UCS_STATIC_BITMAP_AND_INPLACE(&mem_access_tls, host_mem_access_tls); + if (UCP_MEM_IS_HOST(mem_type) || UCS_STATIC_BITMAP_IS_ZERO(mem_access_tls)) { continue; @@ -725,6 +731,9 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker) goto err_free_address_list; } + /* Mem type EP cannot have more than one lane */ + num_lanes = ucp_ep_num_lanes(worker->mem_type_ep[mem_type]); + ucs_assertv_always(num_lanes == 1, "num_lanes=%u", num_lanes); UCS_ASYNC_UNBLOCK(&worker->async); ucs_free(local_address.address_list); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index 116f3791f35..85228487e37 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -74,10 +74,6 @@ static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = { "Estimated 
CPU overhead for transferring GPU memory", ucs_offsetof(uct_cuda_ipc_iface_config_t, params.overhead), UCS_CONFIG_TYPE_TIME}, - {"ENABLE_SAME_PROCESS", "n", - "Enable same process same device communication for cuda_ipc", - ucs_offsetof(uct_cuda_ipc_iface_config_t, params.enable_same_process), UCS_CONFIG_TYPE_BOOL}, - {NULL} }; @@ -146,8 +142,7 @@ uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface, dev_addr = (const uct_cuda_ipc_device_addr_t *)params->device_addr; same_uuid = (ucs_get_system_id() == dev_addr->system_uuid); - if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid && - !iface->config.enable_same_process) { + if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) { uct_iface_fill_info_str_buf(params, "same process"); return 0; } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h index c749fcff9e0..663cece255f 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h @@ -28,7 +28,6 @@ typedef struct { double bandwidth; /* estimated bandwidth */ double latency; /* estimated latency */ double overhead; /* estimated CPU overhead */ - int enable_same_process; /* enable cuda_ipc for same pid same device */ } uct_cuda_ipc_iface_config_params_t; From 77634250866cde55b01f5a7d26ee8989794b3a02 Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Sun, 5 Oct 2025 11:55:49 +0000 Subject: [PATCH 2/7] UCP/TEST: remove old config from test --- test/gtest/ucp/test_ucp_device.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 4340f1c7222..1081aa797d5 100644 --- a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -69,7 +69,6 @@ void test_ucp_device::get_test_variants(std::vector &variants) void test_ucp_device::init() { - m_env.push_back(new ucs::scoped_setenv("UCX_CUDA_IPC_ENABLE_SAME_PROCESS", "y")); m_env.push_back(new 
ucs::scoped_setenv("UCX_IB_GPU_IB_DISTANCE_LATENCY_THRESH", "1000ns")); ucp_test::init(); sender().connect(&receiver(), get_ep_params()); From d57ea7352113380ea677fc6a04985423c6a4a15c Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Sun, 19 Oct 2025 14:06:18 +0000 Subject: [PATCH 3/7] UCT/CUDA_IPC: removed self check in reachability --- src/uct/cuda/cuda_ipc/cuda_ipc_iface.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index 85228487e37..e7436c41fc8 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -142,11 +142,6 @@ uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface, dev_addr = (const uct_cuda_ipc_device_addr_t *)params->device_addr; same_uuid = (ucs_get_system_id() == dev_addr->system_uuid); - if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) { - uct_iface_fill_info_str_buf(params, "same process"); - return 0; - } - if (same_uuid || uct_cuda_ipc_iface_mnnvl_supported(md, dev_addr, dev_addr_len)) { return uct_iface_scope_is_reachable(tl_iface, params); From 57c56867503a804ac7353494b1cac14861c42325 Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Wed, 22 Oct 2025 19:31:40 +0000 Subject: [PATCH 4/7] TEST/UCT: fixed CI failures --- test/gtest/ucp/test_ucp_peer_failure.cc | 2 +- test/gtest/uct/test_uct_iface.cc | 23 ++--------------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index 80d459568aa..f01d1ddaea7 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -987,7 +987,7 @@ UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, rtr_mtype) } UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, pipeline, - "RNDV_FRAG_SIZE=host:8K") + "RNDV_FRAG_SIZE=host:8K,cuda:8K") { rndv_progress_failure_test(rndv_mode::put_ppln, true); } diff --git 
a/test/gtest/uct/test_uct_iface.cc b/test/gtest/uct/test_uct_iface.cc index 233f0864d88..f6a0578ab9d 100644 --- a/test/gtest/uct/test_uct_iface.cc +++ b/test/gtest/uct/test_uct_iface.cc @@ -24,11 +24,6 @@ class test_uct_iface : public uct_test { } void test_is_reachable(); - - virtual bool is_self_reachable() const - { - return true; - } }; void test_uct_iface::test_is_reachable() @@ -63,7 +58,7 @@ void test_uct_iface::test_is_reachable() ASSERT_UCS_OK(status); bool is_reachable = uct_iface_is_reachable_v2(iface, &params); - EXPECT_EQ(is_self_reachable(), is_reachable); + EXPECT_TRUE(is_reachable); // Allocate corrupted address buffers, make it larger than the correct // buffer size in case the corrupted data indicates a larger address length @@ -98,18 +93,4 @@ UCS_TEST_P(test_uct_iface, is_reachable) } UCT_INSTANTIATE_TEST_CASE(test_uct_iface) - -class test_uct_iface_self_unreachable : public test_uct_iface { -protected: - bool is_self_reachable() const override - { - return false; - } -}; - -UCS_TEST_P(test_uct_iface_self_unreachable, is_reachable) -{ - test_is_reachable(); -} - -UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface_self_unreachable) +UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface) From a03be171180c75b34b453c2af64108643eb1ea06 Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Thu, 23 Oct 2025 17:45:33 +0000 Subject: [PATCH 5/7] UCP/TEST: separate ipc test case from others --- test/gtest/ucp/test_ucp_peer_failure.cc | 1 + test/gtest/ucp/ucp_test.h | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index f01d1ddaea7..dddf816ae21 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -993,3 +993,4 @@ UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, pipeline, } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_peer_failure_rndv_put_ppln_abort); 
+UCP_INSTANTIATE_TEST_CASE_GPU_IPC(test_ucp_peer_failure_rndv_put_ppln_abort); diff --git a/test/gtest/ucp/ucp_test.h b/test/gtest/ucp/ucp_test.h index ca6c3949e5a..1309ed7720d 100644 --- a/test/gtest/ucp/ucp_test.h +++ b/test/gtest/ucp/ucp_test.h @@ -482,11 +482,19 @@ std::vector enum_test_params(const std::string& tls) "srd") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib, \ "shm,ib,gdr_copy") \ - UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib_ipc, \ - "shm,ib,cuda_ipc,rocm_ipc") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ugni, \ "ugni") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, tcp, \ "tcp") + +/** + * Instantiate the parameterized test case for gpu ipc transports + * + * @param _test_case Test case class, derived from ucp_test. + */ +#define UCP_INSTANTIATE_TEST_CASE_GPU_IPC(_test_case) \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib_ipc, \ + "shm,ib,cuda_ipc,rocm_ipc") + #endif From c7638b93fee5fa9f4282394800b56b7f6abfdd0c Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Thu, 23 Oct 2025 18:05:53 +0000 Subject: [PATCH 6/7] UCP/TEST: revert last commit --- test/gtest/ucp/test_ucp_peer_failure.cc | 1 - test/gtest/ucp/ucp_test.h | 12 ++---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index dddf816ae21..f01d1ddaea7 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -993,4 +993,3 @@ UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, pipeline, } UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_peer_failure_rndv_put_ppln_abort); -UCP_INSTANTIATE_TEST_CASE_GPU_IPC(test_ucp_peer_failure_rndv_put_ppln_abort); diff --git a/test/gtest/ucp/ucp_test.h b/test/gtest/ucp/ucp_test.h index 1309ed7720d..ca6c3949e5a 100644 --- a/test/gtest/ucp/ucp_test.h +++ b/test/gtest/ucp/ucp_test.h @@ -482,19 +482,11 @@ std::vector enum_test_params(const std::string& tls) 
"srd") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib, \ "shm,ib,gdr_copy") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib_ipc, \ + "shm,ib,cuda_ipc,rocm_ipc") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ugni, \ "ugni") \ UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, tcp, \ "tcp") - -/** - * Instantiate the parameterized test case for gpu ipc transports - * - * @param _test_case Test case class, derived from ucp_test. - */ -#define UCP_INSTANTIATE_TEST_CASE_GPU_IPC(_test_case) \ - UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib_ipc, \ - "shm,ib,cuda_ipc,rocm_ipc") - #endif From e8dff898a00097f0a7114e47056fb17153c19904 Mon Sep 17 00:00:00 2001 From: Shachar Hasson Date: Mon, 27 Oct 2025 15:25:12 +0000 Subject: [PATCH 7/7] UCP/TEST: fix flush --- src/ucp/rma/flush.c | 2 +- src/ucp/rma/rma.h | 1 + test/gtest/ucp/test_ucp_memheap.cc | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index 43af459493c..07d6638cef8 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -530,7 +530,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nbx, (ep, param), return request; } -static ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) +ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) { ucp_rsc_index_t iface_id; ucp_worker_iface_t *wiface; diff --git a/src/ucp/rma/rma.h b/src/ucp/rma/rma.h index 14516c1ce7f..9242d2468b4 100644 --- a/src/ucp/rma/rma.h +++ b/src/ucp/rma/rma.h @@ -100,6 +100,7 @@ extern ucp_amo_proto_t ucp_amo_sw_proto; extern const ucp_rma_proto_t *ucp_rma_proto_list[]; extern const ucp_amo_proto_t *ucp_amo_proto_list[]; +ucs_status_t ucp_worker_flush_check(ucp_worker_h worker); ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, ucs_status_t status, diff --git a/test/gtest/ucp/test_ucp_memheap.cc b/test/gtest/ucp/test_ucp_memheap.cc index d60f6f3af03..991c14dc7ec 100644 --- 
a/test/gtest/ucp/test_ucp_memheap.cc +++ b/test/gtest/ucp/test_ucp_memheap.cc @@ -10,6 +10,10 @@ #include #include #include +extern "C" { +#include +} + #include @@ -95,6 +99,9 @@ void test_ucp_memheap::test_xfer(send_func_t send_func, size_t size, flush_ep(sender()); } else { flush_worker(sender()); + while(ucp_worker_flush_check(sender().worker()) != UCS_OK) { + progress(); + } } /* Validate data */