From 81975e2347d52b982d832f94f0e9234a7eda0b86 Mon Sep 17 00:00:00 2001
From: "Samuel K. Gutierrez"
Date: Fri, 26 Jul 2024 19:57:51 -0600
Subject: [PATCH] Move more infrastructure into qvi-split.

Signed-off-by: Samuel K. Gutierrez
---
 src/qvi-hwpool.cc |  29 +---
 src/qvi-scope.cc  | 392 ++------------------------------------------
 src/qvi-split.cc  | 403 ++++++++++++++++++++++++++++++++++++++++------
 src/qvi-split.h   | 126 +++++++++++++--
 4 files changed, 483 insertions(+), 467 deletions(-)

diff --git a/src/qvi-hwpool.cc b/src/qvi-hwpool.cc
index 7bc980f..a04630c 100644
--- a/src/qvi-hwpool.cc
+++ b/src/qvi-hwpool.cc
@@ -130,8 +130,8 @@ qvi_hwpool_cpu_s::cpuset(void)
 }
 
 const qvi_hwloc_bitmap_s &
-qvi_hwpool_cpu_s::cpuset(void)
-    const {
+qvi_hwpool_cpu_s::cpuset(void) const
+{
     return m_cpuset;
 }
 
@@ -228,8 +228,8 @@ qvi_hwpool_dev_s::id(
 }
 
 const qvi_hwloc_bitmap_s &
-qvi_hwpool_dev_s::affinity(void)
-    const {
+qvi_hwpool_dev_s::affinity(void) const
+{
     return m_affinity;
 }
 
@@ -471,27 +471,6 @@ qvi_hwpool_s::unpack(
     return rc;
 }
 
-#if 0
-/**
- * Extend namespace std so we can easily add qvi_devinfo_ts to
- * unordered_sets.
- */
-namespace std {
-    template <>
-    struct hash<qvi_hwpool_dev_s>
-    {
-        size_t
-        operator()(const qvi_hwpool_dev_s &x) const
-        {
-            const int a = x.m_id;
-            const int b = (int)x.type;
-            const int64_t c = qvi_cantor_pairing(a, b);
-            return hash<int64_t>()(c);
-        }
-    };
-}
-#endif
-
 /*
  * vim: ft=cpp ts=4 sts=4 sw=4 expandtab
  */
diff --git a/src/qvi-scope.cc b/src/qvi-scope.cc
index d6fd773..3621081 100644
--- a/src/qvi-scope.cc
+++ b/src/qvi-scope.cc
@@ -18,10 +18,8 @@
 // TODO(skg) Add RMI to acquire/release resources.
 
 #include "qvi-scope.h"
-#include "qvi-task.h"
-#include "qvi-bbuff.h"
 #include "qvi-rmi.h"
-#include "qvi-bbuff-rmi.h"
+#include "qvi-task.h"
 #include "qvi-hwpool.h"
 #include "qvi-split.h"
 #include "qvi-utils.h"
@@ -247,369 +245,6 @@ qvi_scope_bind_string(
     return rc;
 }
 
-template <typename TYPE>
-static int
-gather_values(
-    qvi_group_t *group,
-    int root,
-    TYPE invalue,
-    std::vector<TYPE> &outvals
-) {
-    static_assert(std::is_trivially_copyable<TYPE>::value, "");
-    const uint_t group_size = group->size();
-
-    qvi_bbuff_t *txbuff = nullptr;
-    int rc = qvi_bbuff_new(&txbuff);
-    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
-
-    rc = txbuff->append(&invalue, sizeof(TYPE));
-    if (qvi_unlikely(rc != QV_SUCCESS)) {
-        qvi_bbuff_delete(&txbuff);
-        return rc;
-    }
-    // Gather the values to the root.
-    bool shared = false;
-    qvi_bbuff_t **bbuffs = nullptr;
-    rc = group->gather(txbuff, root, &shared, &bbuffs);
-    if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
-    // The root fills in the output.
-    if (group->rank() == root) {
-        outvals.resize(group_size);
-        // Unpack the values.
-        for (uint_t i = 0; i < group_size; ++i) {
-            outvals[i] = *(TYPE *)bbuffs[i]->data();
-        }
-    }
-out:
-    if (!shared || (shared && (group->rank() == root))) {
-        if (bbuffs) {
-            for (uint_t i = 0; i < group_size; ++i) {
-                qvi_bbuff_delete(&bbuffs[i]);
-            }
-            delete[] bbuffs;
-        }
-    }
-    qvi_bbuff_delete(&txbuff);
-    if (qvi_unlikely(rc != QV_SUCCESS)) {
-        // If something went wrong, just zero-initialize the values.
-        outvals = {};
-    }
-    return rc;
-}
-
-static int
-gather_hwpools(
-    qvi_group_t *group,
-    int root,
-    qvi_hwpool_s *txpool,
-    std::vector<qvi_hwpool_s *> &rxpools
-) {
-    const uint_t group_size = group->size();
-    // Pack the hardware pool into a buffer.
-    qvi_bbuff_t txbuff;
-    int rc = txpool->packinto(&txbuff);
-    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
-    // Gather the values to the root.
-    bool shared = false;
-    qvi_bbuff_t **bbuffs = nullptr;
-    rc = group->gather(&txbuff, root, &shared, &bbuffs);
-    if (rc != QV_SUCCESS) goto out;
-
-    if (group->rank() == root) {
-        rxpools.resize(group_size);
-        // Unpack the hwpools.
-        for (uint_t i = 0; i < group_size; ++i) {
-            rc = qvi_bbuff_rmi_unpack(
-                bbuffs[i]->data(), &rxpools[i]
-            );
-            if (qvi_unlikely(rc != QV_SUCCESS)) break;
-        }
-    }
-out:
-    if (!shared || (shared && (group->rank() == root))) {
-        if (bbuffs) {
-            for (uint_t i = 0; i < group_size; ++i) {
-                qvi_bbuff_delete(&bbuffs[i]);
-            }
-            delete[] bbuffs;
-        }
-    }
-    if (rc != QV_SUCCESS) {
-        // If something went wrong, just zero-initialize the pools.
-        rxpools = {};
-    }
-    return rc;
-}
-
-template <typename TYPE>
-static int
-scatter_values(
-    qvi_group_t *group,
-    int root,
-    const std::vector<TYPE> &values,
-    TYPE *value
-) {
-    static_assert(std::is_trivially_copyable<TYPE>::value, "");
-
-    int rc = QV_SUCCESS;
-    qvi_bbuff_t *rxbuff = nullptr;
-
-    std::vector<qvi_bbuff_t *> txbuffs(0);
-    if (root == group->rank()) {
-        const uint_t group_size = group->size();
-        txbuffs.resize(group_size);
-        // Pack the values.
-        for (uint_t i = 0; i < group_size; ++i) {
-            rc = qvi_bbuff_new(&txbuffs[i]);
-            if (qvi_unlikely(rc != QV_SUCCESS)) break;
-
-            rc = txbuffs[i]->append(&values[i], sizeof(TYPE));
-            if (qvi_unlikely(rc != QV_SUCCESS)) break;
-        }
-        if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
-    }
-
-    rc = group->scatter(txbuffs.data(), root, &rxbuff);
-    if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
-
-    *value = *(TYPE *)rxbuff->data();
-out:
-    for (auto &buff : txbuffs) {
-        qvi_bbuff_delete(&buff);
-    }
-    qvi_bbuff_delete(&rxbuff);
-    if (rc != QV_SUCCESS) {
-        // If something went wrong, just zero-initialize the value.
-        *value = {};
-    }
-    return rc;
-}
-
-static int
-scatter_hwpools(
-    qvi_group_t *group,
-    int root,
-    const std::vector<qvi_hwpool_s *> &pools,
-    qvi_hwpool_s **pool
-) {
-    int rc = QV_SUCCESS;
-    std::vector<qvi_bbuff_t *> txbuffs(0);
-    qvi_bbuff_t *rxbuff = nullptr;
-
-    if (root == group->rank()) {
-        const uint_t group_size = group->size();
-        txbuffs.resize(group_size);
-        // Pack the hwpools.
-        for (uint_t i = 0; i < group_size; ++i) {
-            rc = qvi_bbuff_new(&txbuffs[i]);
-            if (rc != QV_SUCCESS) break;
-
-            rc = pools[i]->packinto(txbuffs[i]);
-            if (rc != QV_SUCCESS) break;
-        }
-        if (rc != QV_SUCCESS) goto out;
-    }
-
-    rc = group->scatter(txbuffs.data(), root, &rxbuff);
-    if (rc != QV_SUCCESS) goto out;
-
-    rc = qvi_bbuff_rmi_unpack(rxbuff->data(), pool);
-out:
-    for (auto &buff : txbuffs) {
-        qvi_bbuff_delete(&buff);
-    }
-    qvi_bbuff_delete(&rxbuff);
-    if (rc != QV_SUCCESS) {
-        qvi_delete(pool);
-    }
-    return rc;
-}
-
-template <typename TYPE>
-static int
-bcast_value(
-    qvi_group_t *group,
-    int root,
-    TYPE *value
-) {
-    static_assert(std::is_trivially_copyable<TYPE>::value, "");
-
-    std::vector<TYPE> values;
-    if (root == group->rank()) {
-        values.resize(group->size());
-        std::fill(values.begin(), values.end(), *value);
-    }
-    return scatter_values(group, root, values, value);
-}
-
-/**
- * Collective split structure: a collection of data relevant to split operations
- * requiring aggregated resource knowledge AND coordination between tasks in the
- * parent scope to perform a split.
- */
-struct qvi_scope_split_coll_s {
-    /**
-     * The root task ID used for collective operations.
-     * Note: We use 0 as the root because 0 will always exist.
-     */
-    static constexpr int rootid = 0;
-    /** Points to the parent scope that we are splitting. */
-    qv_scope_t *parent = nullptr;
-    /** My color. */
-    int mycolor = 0;
-    /**
-     * Stores group-global split information brought together by collective
-     * operations across the members in parent_scope.
-     */
-    qvi_hwsplit_s gsplit;
-    /** Constructor. */
-    qvi_scope_split_coll_s(void) = delete;
-    /** Constructor. */
-    qvi_scope_split_coll_s(
-        qv_scope_t *parent_a,
-        uint_t split_size_a,
-        int mycolor_a,
-        qv_hw_obj_type_t split_at_type_a
-    ) : parent(parent_a)
-      , mycolor(mycolor_a)
-    {
-        const qvi_group_t *const pgroup = parent->group;
-        if (pgroup->rank() == qvi_scope_split_coll_s::rootid) {
-            gsplit = qvi_hwsplit_s(
-                parent, pgroup->size(), split_size_a, split_at_type_a
-            );
-        }
-    }
-};
-
-static int
-scope_split_coll_gather(
-    qvi_scope_split_coll_s &splitcoll
-) {
-    qv_scope_t *const parent = splitcoll.parent;
-
-    int rc = gather_values(
-        parent->group,
-        qvi_scope_split_coll_s::rootid,
-        qvi_task_t::mytid(),
-        splitcoll.gsplit.taskids
-    );
-    if (rc != QV_SUCCESS) return rc;
-    // Note that the result hwpools are copies, so we can modify them freely.
-    rc = gather_hwpools(
-        parent->group,
-        qvi_scope_split_coll_s::rootid,
-        parent->hwpool,
-        splitcoll.gsplit.hwpools
-    );
-    if (rc != QV_SUCCESS) return rc;
-
-    rc = gather_values(
-        parent->group,
-        qvi_scope_split_coll_s::rootid,
-        splitcoll.mycolor,
-        splitcoll.gsplit.colors
-    );
-    if (rc != QV_SUCCESS) return rc;
-
-    const int myid = parent->group->rank();
-    const uint_t group_size = parent->group->size();
-    if (myid == qvi_scope_split_coll_s::rootid) {
-        splitcoll.gsplit.affinities.resize(group_size);
-        for (uint_t tid = 0; tid < group_size; ++tid) {
-            hwloc_cpuset_t cpuset = nullptr;
-            rc = parent->group->task()->bind_top(&cpuset);
-            if (rc != QV_SUCCESS) break;
-
-            rc = splitcoll.gsplit.affinities[tid].set(cpuset);
-            // Clean up.
-            qvi_hwloc_bitmap_delete(&cpuset);
-            if (rc != QV_SUCCESS) break;
-        }
-    }
-    return rc;
-}
-
-static int
-scope_split_coll_scatter(
-    const qvi_scope_split_coll_s &splitcoll,
-    int *colorp,
-    qvi_hwpool_s **result
-) {
-    const int rc = scatter_values(
-        splitcoll.parent->group,
-        qvi_scope_split_coll_s::rootid,
-        splitcoll.gsplit.colors,
-        colorp
-    );
-    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
-
-    return scatter_hwpools(
-        splitcoll.parent->group,
-        qvi_scope_split_coll_s::rootid,
-        splitcoll.gsplit.hwpools,
-        result
-    );
-}
-
-/**
- * Split the hardware resources based on the provided split parameters:
- * - npieces: The number of splits requested.
- * - color: Either user-supplied (explicitly set) or a value that requests
- *   us to do the coloring for the callers.
- *   maybe_obj_type: Potentially the object type that we are splitting at. This
- *   value influences how the splitting algorithms perform their mapping.
- * - colorp: color' is potentially a new color assignment determined by one
- *   of our coloring algorithms. This value can be used to influence the
- *   group splitting that occurs after this call completes.
- */
-static int
-coll_split_hardware_resources(
-    qv_scope_t *parent,
-    int npieces,
-    int color,
-    qv_hw_obj_type_t maybe_obj_type,
-    int *colorp,
-    qvi_hwpool_s **result
-) {
-    int rc2 = QV_SUCCESS;
-    const int rootid = qvi_scope_split_coll_s::rootid, myid = parent->group->rank();
-    // Information relevant to hardware resource splitting. Note that
-    // aggregated data are only valid for the task whose id is equal to
-    // qvi_global_split_t::rootid after gather has completed.
-    qvi_scope_split_coll_s splitcoll(
-        parent, npieces, color, maybe_obj_type
-    );
-    // First consolidate the provided information, as this is coming from a
-    // SPMD-like context (e.g., splitting a resource shared by MPI processes).
-    // In most cases it is easiest to have a single task calculate the split
-    // based on global knowledge and later redistribute the calculated result to
-    // its group members.
-    int rc = scope_split_coll_gather(splitcoll);
-    if (rc != QV_SUCCESS) goto out;
-    // The root does this calculation.
-    if (myid == rootid) {
-        rc2 = splitcoll.gsplit.split();
-    }
-    // Wait for the split information. Explicitly barrier here in case the
-    // underlying broadcast implementation polls heavily for completion.
-    rc = splitcoll.parent->group->barrier();
-    if (rc != QV_SUCCESS) goto out;
-    // To avoid hangs in split error paths, share the split rc with everyone.
-    rc = bcast_value(splitcoll.parent->group, rootid, &rc2);
-    if (rc != QV_SUCCESS) goto out;
-    // If the split failed, return the error to all callers.
-    if (rc2 != QV_SUCCESS) {
-        rc = rc2;
-        goto out;
-    }
-    // Scatter the results.
-    rc = scope_split_coll_scatter(splitcoll, colorp, result);
-    if (rc != QV_SUCCESS) goto out;
-out:
-    return rc;
-}
-
 int
 qvi_scope_split(
     qv_scope_t *parent,
@@ -623,9 +258,10 @@ qvi_scope_split(
     qvi_group_t *group = nullptr;
     qv_scope_t *ichild = nullptr;
     // Split the hardware resources based on the provided split parameters.
-    rc = coll_split_hardware_resources(
-        parent, npieces, color, maybe_obj_type, &colorp, &hwpool
+    qvi_scope_split_coll_s splitcoll(
+        parent, npieces, color, maybe_obj_type
     );
+    rc = splitcoll.split(&colorp, &hwpool);
     if (rc != QV_SUCCESS) goto out;
     // Split underlying group. Notice the use of colorp here.
     rc = parent->group->split(
@@ -657,12 +293,10 @@ qvi_scope_thsplit(
     qvi_group_t *const pgroup = parent->group;
 
     const uint_t group_size = k;
-
-    qvi_hwsplit_s splitagg(
-        parent, group_size, npieces, maybe_obj_type
-    );
+    // Construct the hardware split.
+    qvi_hwsplit_s hwsplit(parent, group_size, npieces, maybe_obj_type);
     // Eagerly make room for the group member information.
-    splitagg.reserve();
+    hwsplit.reserve();
     // Since this is called by a single task, get its ID and associated
     // hardware affinity here, and replicate them in the following loop
     // that populates splitagg.
@@ -674,21 +308,21 @@
     if (rc != QV_SUCCESS) return rc;
     for (uint_t i = 0; i < group_size; ++i) {
         // Store requested colors in aggregate.
-        splitagg.colors[i] = kcolors[i];
+        hwsplit.m_colors[i] = kcolors[i];
         // Since the parent hardware pool is the resource we are splitting and
         // agg_split_* calls expect |group_size| elements, replicate by dups.
-        rc = qvi_dup(*parent->hwpool, &splitagg.hwpools[i]);
+        rc = qvi_dup(*parent->hwpool, &hwsplit.m_hwpools[i]);
        if (rc != QV_SUCCESS) break;
         // Since this is called by a single task, replicate its task ID, too.
-        splitagg.taskids[i] = taskid;
+        hwsplit.m_taskids[i] = taskid;
         // Same goes for the task's affinity.
-        splitagg.affinities[i].set(task_affinity);
+        hwsplit.m_affinities[i].set(task_affinity);
     }
     // Cleanup: we don't need task_affinity anymore.
     qvi_hwloc_bitmap_delete(&task_affinity);
     if (rc != QV_SUCCESS) return rc;
     // Split the hardware resources based on the provided split parameters.
-    rc = splitagg.split();
+    rc = hwsplit.split();
     if (rc != QV_SUCCESS) return rc;
     // Split off from our parent group. This is called from a context in
     // which a process is splitting its resources across threads, so create a
@@ -701,7 +335,7 @@
     for (uint_t i = 0; i < group_size; ++i) {
         // Copy out, since the hardware pools in splitagg will get freed.
         qvi_hwpool_s *hwpool = nullptr;
-        rc = qvi_dup(*splitagg.hwpools[i], &hwpool);
+        rc = qvi_dup(*hwsplit.m_hwpools[i], &hwpool);
         if (rc != QV_SUCCESS) break;
         // Create and initialize the new scope.
         qv_scope_t *child = nullptr;
diff --git a/src/qvi-split.cc b/src/qvi-split.cc
index 6ee5858..3acc2c2 100644
--- a/src/qvi-split.cc
+++ b/src/qvi-split.cc
@@ -12,23 +12,25 @@
  */
 
 #include "qvi-split.h"
+#include "qvi-bbuff.h"
 #include "qvi-rmi.h"
+#include "qvi-bbuff-rmi.h"
 #include "qvi-task.h" // IWYU pragma: keep
 #include "qvi-scope.h" // IWYU pragma: keep
 
 /** Maintains a mapping between IDs and device information. */
-using id_devinfo_multimap_t = std::multimap<int, const qvi_hwpool_dev_s *>;
+using id2devs_t = std::multimap<int, const qvi_hwpool_dev_s *>;
 
 qvi_hwsplit_s::qvi_hwsplit_s(
     qv_scope_t *parent,
-    uint_t group_size_a,
-    uint_t split_size_a,
-    qv_hw_obj_type_t split_at_type_a
-) : rmi(parent->group->task()->rmi())
-  , base_hwpool(parent->hwpool)
-  , group_size(group_size_a)
-  , split_size(split_size_a)
-  , split_at_type(split_at_type_a)
+    uint_t group_size,
+    uint_t split_size,
+    qv_hw_obj_type_t split_at_type
+) : m_rmi(parent->group->task()->rmi())
+  , m_hwpool(parent->hwpool)
+  , m_group_size(group_size)
+  , m_split_size(split_size)
+  , m_split_at_type(split_at_type)
 {
     // To save memory we don't eagerly resize our vectors to group_size
     // since most processes will not use the storage. For example, in the
@@ -39,7 +41,7 @@ qvi_hwsplit_s::qvi_hwsplit_s(
 
 qvi_hwsplit_s::~qvi_hwsplit_s(void)
 {
-    for (auto &hwpool : hwpools) {
+    for (auto &hwpool : m_hwpools) {
         qvi_delete(&hwpool);
     }
 }
@@ -47,18 +49,18 @@ qvi_hwsplit_s::~qvi_hwsplit_s(void)
 void
 qvi_hwsplit_s::reserve(void)
 {
-    taskids.resize(group_size);
-    hwpools.resize(group_size);
-    colors.resize(group_size);
-    affinities.resize(group_size);
+    m_taskids.resize(m_group_size);
+    m_hwpools.resize(m_group_size);
+    m_colors.resize(m_group_size);
+    m_affinities.resize(m_group_size);
 }
 
 qvi_hwloc_bitmap_s
 qvi_hwsplit_s::cpuset(void) const
 {
     // This shouldn't happen.
-    assert(hwpools.size() != 0);
-    return hwpools[0]->cpuset();
+    assert(m_hwpools.size() != 0);
+    return m_hwpools[0]->cpuset();
 }
 
 int
@@ -68,16 +70,16 @@ qvi_hwsplit_s::split_cpuset(
     qvi_hwloc_cpusets_t &result
 ) const {
     // The cpuset that we are going to split.
     const qvi_hwloc_bitmap_s base_cpuset = cpuset();
     // Pointer to my hwloc instance.
-    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi);
+    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(m_rmi);
     // Holds the host's split cpusets.
-    result.resize(split_size);
+    result.resize(m_split_size);
     // Notice that we do not go through the RMI for this because this is just a
     // local, temporary splitting that is ultimately fed to another splitting
     // algorithm.
     int rc = QV_SUCCESS;
-    for (uint_t chunkid = 0; chunkid < split_size; ++chunkid) {
+    for (uint_t chunkid = 0; chunkid < m_split_size; ++chunkid) {
         rc = qvi_hwloc_split_cpuset_by_chunk_id(
-            hwloc, base_cpuset.cdata(), split_size,
+            hwloc, base_cpuset.cdata(), m_split_size,
             chunkid, result[chunkid].data()
         );
         if (rc != QV_SUCCESS) break;
@@ -91,16 +93,16 @@ qvi_hwsplit_s::osdev_cpusets(
 ) const {
     // Get the number of devices we have available in the provided scope.
     int nobj = 0;
-    int rc = base_hwpool->nobjects(
-        qvi_rmi_client_hwloc(rmi), split_at_type, &nobj
+    int rc = m_hwpool->nobjects(
+        qvi_rmi_client_hwloc(m_rmi), m_split_at_type, &nobj
     );
     if (rc != QV_SUCCESS) return rc;
     // Holds the device affinities used for the split.
     result.resize(nobj);
     uint_t affi = 0;
-    for (const auto &dinfo : base_hwpool->devices()) {
+    for (const auto &dinfo : m_hwpool->devices()) {
         // Not the type we are looking to split.
-        if (split_at_type != dinfo.first) continue;
+        if (m_split_at_type != dinfo.first) continue;
         // Copy the device's affinity to our list of device affinities.
         result[affi++] = dinfo.second->affinity();
     }
@@ -116,8 +118,8 @@ qvi_hwsplit_s::primary_cpusets(
     // split() context, which uses the host's cpuset to split the resources.
     // TODO(skg) This looks suspicious to me. Make sure we want to do this.
     // What about getting called from a split context for devices?
-    if (qvi_hwloc_obj_type_is_host_resource(split_at_type) ||
-        split_at_type == QV_HW_OBJ_LAST) {
+    if (qvi_hwloc_obj_type_is_host_resource(m_split_at_type) ||
+        m_split_at_type == QV_HW_OBJ_LAST) {
         return split_cpuset(result);
     }
     // An OS device.
@@ -129,7 +131,7 @@ qvi_hwsplit_s::primary_cpusets(
 qvi_map_fn_t
 qvi_hwsplit_s::affinity_preserving_policy(void) const
 {
-    switch (split_at_type) {
+    switch (m_split_at_type) {
     // For split()
     case QV_HW_OBJ_LAST:
         return qvi_map_packed;
@@ -143,7 +145,7 @@ int
 qvi_hwsplit_s::release_devices(void)
 {
     int rc = QV_SUCCESS;
-    for (auto &hwpool : hwpools) {
+    for (auto &hwpool : m_hwpools) {
         rc = hwpool->release_devices();
         if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
     }
@@ -164,18 +166,18 @@ qvi_hwsplit_s::split_devices_user_defined(void)
     // Determine mapping of colors to task IDs. The array index i of colors is
     // the color requested by task i. Also determine the number of distinct
     // colors provided in the colors array.
-    std::set<int> color_set(colors.begin(), colors.end());
+    std::set<int> color_set(m_colors.begin(), m_colors.end());
     // Adjust the color set so that the distinct colors provided
     // fall within the range of the number of splits requested.
     std::set<int> color_setp;
     uint_t ncolors_chosen = 0;
     for (const auto &c : color_set) {
-        if (ncolors_chosen >= split_size) break;
+        if (ncolors_chosen >= m_split_size) break;
         color_setp.insert(c);
         ncolors_chosen++;
     }
     // Cache all device infos associated with the parent hardware pool.
-    auto dinfos = base_hwpool->devices();
+    auto dinfos = m_hwpool->devices();
     // Iterate over the supported device types and split them up round-robin.
     // TODO(skg) Should this be a mapping operation in qvi-map?
     for (const auto devt : qvi_hwloc_supported_devices()) {
@@ -189,7 +191,7 @@ qvi_hwsplit_s::split_devices_user_defined(void)
             devs.push_back(dinfo.second.get());
         }
         // Maps colors to device information.
-        id_devinfo_multimap_t devmap;
+        id2devs_t devmap;
         uint_t devi = 0;
         while (devi < ndevs) {
             for (const auto &c : color_setp) {
@@ -199,11 +201,11 @@
         }
         // Now that we have the mapping of colors to devices, assign devices to
        // the associated hardware pools.
-        for (uint_t i = 0; i < group_size; ++i) {
-            const int color = colors[i];
+        for (uint_t i = 0; i < m_group_size; ++i) {
+            const int color = m_colors[i];
             for (const auto &c2d : devmap) {
                 if (c2d.first != color) continue;
-                rc = hwpools[i]->add_device(*c2d.second);
+                rc = m_hwpools[i]->add_device(*c2d.second);
                 if (rc != QV_SUCCESS) break;
             }
             if (rc != QV_SUCCESS) break;
@@ -223,7 +225,7 @@ qvi_hwsplit_s::split_devices_affinity_preserving(void)
     int rc = release_devices();
     if (rc != QV_SUCCESS) return rc;
     // Get a pointer to device infos associated with the parent hardware pool.
-    auto dinfos = base_hwpool->devices();
+    auto dinfos = m_hwpool->devices();
     // Iterate over the supported device types and split them up.
     for (const auto devt : qvi_hwloc_supported_devices()) {
         // Store device infos.
@@ -242,7 +244,7 @@ qvi_hwsplit_s::split_devices_affinity_preserving(void)
         qvi_map_t map;
         const auto policy = affinity_preserving_policy();
         rc = qvi_map_affinity_preserving(
-            map, policy, devaffs, affinities
+            map, policy, devaffs, m_affinities
         );
         if (rc != QV_SUCCESS) return rc;
         //qvi_map_debug_dump(map);
@@ -251,7 +253,7 @@
         for (const auto &mi : map) {
             const uint_t devid = mi.first;
             const uint_t pooli = mi.second;
-            rc = hwpools[pooli]->add_device(*devs[devid]);
+            rc = m_hwpools[pooli]->add_device(*devs[devid]);
             if (rc != QV_SUCCESS) break;
         }
         if (rc != QV_SUCCESS) break;
@@ -296,15 +298,15 @@ qvi_hwsplit_s::split_user_defined(void)
     int rc = split_cpuset(cpusets);
     if (rc != QV_SUCCESS) return rc;
     // Developer sanity check.
-    assert(cpusets.size() == split_size);
+    assert(cpusets.size() == m_split_size);
     // Maintains the mapping between task (consumer) IDs and resource IDs.
     qvi_map_t map{};
-    rc = qvi_map_colors(map, colors, cpusets);
+    rc = qvi_map_colors(map, m_colors, cpusets);
     if (rc != QV_SUCCESS) return rc;
-    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi);
+    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(m_rmi);
     // Update the hardware pools and colors to reflect the new mapping.
     rc = apply_cpuset_mapping(
-        hwloc, map, cpusets, hwpools, colors
+        hwloc, map, cpusets, m_hwpools, m_colors
     );
     if (rc != QV_SUCCESS) return rc;
     // Use a straightforward device splitting algorithm based on user's request.
@@ -324,17 +326,17 @@ qvi_hwsplit_s::split_affinity_preserving_pass1(void)
     // Map tasks based on their affinity to resources encoded by the cpusets.
     const auto policy = affinity_preserving_policy();
     rc = qvi_map_affinity_preserving(
-        map, policy, affinities, cpusets
+        map, policy, m_affinities, cpusets
     );
     if (rc != QV_SUCCESS) return rc;
     // Make sure that we mapped all the tasks. If not, this is a bug.
-    if (qvi_map_nfids_mapped(map) != group_size) {
+    if (qvi_map_nfids_mapped(map) != m_group_size) {
         qvi_abort();
     }
-    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi);
+    qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(m_rmi);
     // Update the hardware pools and colors to reflect the new mapping.
     return apply_cpuset_mapping(
-        hwloc, map, cpusets, hwpools, colors
+        hwloc, map, cpusets, m_hwpools, m_colors
     );
 }
 
@@ -386,7 +388,7 @@ qvi_hwsplit_s::split(void)
     // provided by the caller. Negative values are reserved for internal
     // use and shall be constants defined in quo-vadis.h. Note we don't sort the
     // splitagg's colors directly because they are ordered by task ID.
-    std::vector<int> tcolors(colors);
+    std::vector<int> tcolors(m_colors);
     std::sort(tcolors.begin(), tcolors.end());
     // We have a few possibilities here:
     // * The values are all positive: user-defined split, but we have to clamp
@@ -401,7 +403,7 @@
     // All colors are positive.
     if (tcolors.front() >= 0) {
-        rc = clamp_colors(colors);
+        rc = clamp_colors(m_colors);
         if (rc != QV_SUCCESS) return rc;
     }
     // Some values are negative.
@@ -417,7 +419,7 @@
         return split_user_defined();
     }
     // Automatic splitting.
-    switch (colors[0]) {
+    switch (m_colors[0]) {
     case QV_SCOPE_SPLIT_AFFINITY_PRESERVING:
         return split_affinity_preserving();
     default:
@@ -427,6 +429,309 @@
     return rc;
 }
 
+qvi_scope_split_coll_s::qvi_scope_split_coll_s(
+    qv_scope_t *parent_a,
+    uint_t split_size_a,
+    int mycolor_a,
+    qv_hw_obj_type_t split_at_type_a
+) : parent(parent_a)
+  , mycolor(mycolor_a)
+{
+    const qvi_group_t *const pgroup = parent->group;
+    if (pgroup->rank() == qvi_scope_split_coll_s::s_rootid) {
+        hwsplit = qvi_hwsplit_s(
+            parent, pgroup->size(), split_size_a, split_at_type_a
+        );
+    }
+}
+
+template <typename TYPE>
+int
+qvi_scope_split_coll_s::scatter_values(
+    int root,
+    const std::vector<TYPE> &values,
+    TYPE *value
+) {
+    static_assert(std::is_trivially_copyable<TYPE>::value, "");
+
+    int rc = QV_SUCCESS;
+    qvi_bbuff_t *rxbuff = nullptr;
+
+    qvi_group_t *const group = parent->group;
+    std::vector<qvi_bbuff_t *> txbuffs(0);
+    if (root == group->rank()) {
+        const uint_t group_size = group->size();
+        txbuffs.resize(group_size);
+        // Pack the values.
+        for (uint_t i = 0; i < group_size; ++i) {
+            rc = qvi_bbuff_new(&txbuffs[i]);
+            if (qvi_unlikely(rc != QV_SUCCESS)) break;
+
+            rc = txbuffs[i]->append(&values[i], sizeof(TYPE));
+            if (qvi_unlikely(rc != QV_SUCCESS)) break;
+        }
+        if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
+    }
+
+    rc = group->scatter(txbuffs.data(), root, &rxbuff);
+    if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
+
+    *value = *(TYPE *)rxbuff->data();
+out:
+    for (auto &buff : txbuffs) {
+        qvi_bbuff_delete(&buff);
+    }
+    qvi_bbuff_delete(&rxbuff);
+    if (rc != QV_SUCCESS) {
+        // If something went wrong, just zero-initialize the value.
+        *value = {};
+    }
+    return rc;
+}
+
+template <typename TYPE>
+int
+qvi_scope_split_coll_s::bcast_value(
+    int root,
+    TYPE *value
+) {
+    static_assert(std::is_trivially_copyable<TYPE>::value, "");
+    qvi_group_t *const group = parent->group;
+
+    std::vector<TYPE> values;
+    if (root == group->rank()) {
+        values.resize(group->size());
+        std::fill(values.begin(), values.end(), *value);
+    }
+    return scatter_values(root, values, value);
+}
+
+template <typename TYPE>
+int
+qvi_scope_split_coll_s::gather_values(
+    int root,
+    TYPE invalue,
+    std::vector<TYPE> &outvals
+) {
+    static_assert(std::is_trivially_copyable<TYPE>::value, "");
+    qvi_group_t *const group = parent->group;
+    const uint_t group_size = group->size();
+
+    qvi_bbuff_t *txbuff = nullptr;
+    int rc = qvi_bbuff_new(&txbuff);
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+
+    rc = txbuff->append(&invalue, sizeof(TYPE));
+    if (qvi_unlikely(rc != QV_SUCCESS)) {
+        qvi_bbuff_delete(&txbuff);
+        return rc;
+    }
+    // Gather the values to the root.
+    bool shared = false;
+    qvi_bbuff_t **bbuffs = nullptr;
+    rc = group->gather(txbuff, root, &shared, &bbuffs);
+    if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
+    // The root fills in the output.
+    if (group->rank() == root) {
+        outvals.resize(group_size);
+        // Unpack the values.
+        for (uint_t i = 0; i < group_size; ++i) {
+            outvals[i] = *(TYPE *)bbuffs[i]->data();
+        }
+    }
+out:
+    if (!shared || (shared && (group->rank() == root))) {
+        if (bbuffs) {
+            for (uint_t i = 0; i < group_size; ++i) {
+                qvi_bbuff_delete(&bbuffs[i]);
+            }
+            delete[] bbuffs;
+        }
+    }
+    qvi_bbuff_delete(&txbuff);
+    if (qvi_unlikely(rc != QV_SUCCESS)) {
+        // If something went wrong, just zero-initialize the values.
+        outvals = {};
+    }
+    return rc;
+}
+
+int
+qvi_scope_split_coll_s::gather_hwpools(
+    int root,
+    qvi_hwpool_s *txpool,
+    std::vector<qvi_hwpool_s *> &rxpools
+) {
+    qvi_group_t *const group = parent->group;
+    const uint_t group_size = group->size();
+    // Pack the hardware pool into a buffer.
+    qvi_bbuff_t txbuff;
+    int rc = txpool->packinto(&txbuff);
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+    // Gather the values to the root.
+    bool shared = false;
+    qvi_bbuff_t **bbuffs = nullptr;
+    rc = group->gather(&txbuff, root, &shared, &bbuffs);
+    if (rc != QV_SUCCESS) goto out;
+
+    if (group->rank() == root) {
+        rxpools.resize(group_size);
+        // Unpack the hwpools.
+        for (uint_t i = 0; i < group_size; ++i) {
+            rc = qvi_bbuff_rmi_unpack(
+                bbuffs[i]->data(), &rxpools[i]
+            );
+            if (qvi_unlikely(rc != QV_SUCCESS)) break;
+        }
+    }
+out:
+    if (!shared || (shared && (group->rank() == root))) {
+        if (bbuffs) {
+            for (uint_t i = 0; i < group_size; ++i) {
+                qvi_bbuff_delete(&bbuffs[i]);
+            }
+            delete[] bbuffs;
+        }
+    }
+    if (rc != QV_SUCCESS) {
+        // If something went wrong, just zero-initialize the pools.
+        rxpools = {};
+    }
+    return rc;
+}
+
+int
+qvi_scope_split_coll_s::gather(void)
+{
+    int rc = gather_values(
+        s_rootid, qvi_task_t::mytid(), hwsplit.m_taskids
+    );
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+    // Note that the result hwpools are copies, so we can modify them freely.
+    rc = gather_hwpools(
+        s_rootid, parent->hwpool, hwsplit.m_hwpools
+    );
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+
+    rc = gather_values(
+        s_rootid, mycolor, hwsplit.m_colors
+    );
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+
+    const int myid = parent->group->rank();
+    const uint_t group_size = parent->group->size();
+    if (myid == qvi_scope_split_coll_s::s_rootid) {
+        hwsplit.m_affinities.resize(group_size);
+        for (uint_t tid = 0; tid < group_size; ++tid) {
+            hwloc_cpuset_t cpuset = nullptr;
+            rc = parent->group->task()->bind_top(&cpuset);
+            if (qvi_unlikely(rc != QV_SUCCESS)) break;
+            //
+            rc = hwsplit.m_affinities[tid].set(cpuset);
+            // Clean up.
+            qvi_hwloc_bitmap_delete(&cpuset);
+            if (qvi_unlikely(rc != QV_SUCCESS)) break;
+        }
+    }
+    return rc;
+}
+
+int
+qvi_scope_split_coll_s::scatter_hwpools(
+    int root,
+    const std::vector<qvi_hwpool_s *> &pools,
+    qvi_hwpool_s **pool
+) {
+    int rc = QV_SUCCESS;
+    std::vector<qvi_bbuff_t *> txbuffs(0);
+    qvi_bbuff_t *rxbuff = nullptr;
+
+    qvi_group_t *const group = parent->group;
+
+    if (root == group->rank()) {
+        const uint_t group_size = group->size();
+        txbuffs.resize(group_size);
+        // Pack the hwpools.
+        for (uint_t i = 0; i < group_size; ++i) {
+            rc = qvi_bbuff_new(&txbuffs[i]);
+            if (rc != QV_SUCCESS) break;
+
+            rc = pools[i]->packinto(txbuffs[i]);
+            if (rc != QV_SUCCESS) break;
+        }
+        if (rc != QV_SUCCESS) goto out;
+    }
+
+    rc = group->scatter(txbuffs.data(), root, &rxbuff);
+    if (rc != QV_SUCCESS) goto out;
+
+    rc = qvi_bbuff_rmi_unpack(rxbuff->data(), pool);
+out:
+    for (auto &buff : txbuffs) {
+        qvi_bbuff_delete(&buff);
+    }
+    qvi_bbuff_delete(&rxbuff);
+    if (rc != QV_SUCCESS) {
+        qvi_delete(pool);
+    }
+    return rc;
+}
+
+int
+qvi_scope_split_coll_s::scatter(
+    int *colorp,
+    qvi_hwpool_s **result
+) {
+    const int rc = scatter_values(s_rootid, hwsplit.m_colors, colorp);
+    if (qvi_unlikely(rc != QV_SUCCESS)) return rc;
+    return scatter_hwpools(s_rootid, hwsplit.m_hwpools, result);
+}
+
+int
+qvi_scope_split_coll_s::barrier(void)
+{
+    return parent->group->barrier();
+}
+
+int
+qvi_scope_split_coll_s::split(
+    int *colorp,
+    qvi_hwpool_s **result
+) {
+    int rc2 = QV_SUCCESS;
+    const int myid = parent->group->rank();
+    // First consolidate the provided information, as this is coming from a
+    // SPMD-like context (e.g., splitting a resource shared by MPI processes).
+    // In most cases it is easiest to have a single task calculate the split
+    // based on global knowledge and later redistribute the calculated result
+    // to its group members. Note that aggregated data are only valid for the
+    // task whose ID is equal to qvi_scope_split_coll_s::s_rootid after gather
+    // has completed.
+    int rc = gather();
+    if (rc != QV_SUCCESS) goto out;
+    // The root does this calculation.
+    if (myid == s_rootid) {
+        rc2 = hwsplit.split();
+    }
+    // Wait for the split information. Explicitly barrier here in case the
+    // underlying broadcast implementation polls heavily for completion.
+    rc = barrier();
+    if (rc != QV_SUCCESS) goto out;
+    // To avoid hangs in split error paths, share the split rc with everyone.
+    rc = bcast_value(s_rootid, &rc2);
+    if (rc != QV_SUCCESS) goto out;
+    // If the split failed, return the error to all callers.
+    if (rc2 != QV_SUCCESS) {
+        rc = rc2;
+        goto out;
+    }
+    // Scatter the results.
+    rc = scatter(colorp, result);
+    if (rc != QV_SUCCESS) goto out;
+out:
+    return rc;
+}
+
 /*
  * vim: ft=cpp ts=4 sts=4 sw=4 expandtab
  */
diff --git a/src/qvi-split.h b/src/qvi-split.h
index 3f664d0..356cb77 100644
--- a/src/qvi-split.h
+++ b/src/qvi-split.h
@@ -30,50 +30,52 @@
 * qvi_scope_split_agg_s, but that isn't a requirement.
 */
 struct qvi_hwsplit_s {
+//private:
     /** A pointer to my RMI. */
-    qvi_rmi_client_t *rmi = nullptr;
+    qvi_rmi_client_t *m_rmi = nullptr;
     /** The base hardware pool we are splitting. */
-    qvi_hwpool_s *base_hwpool = nullptr;
+    qvi_hwpool_s *m_hwpool = nullptr;
     /** The number of members that are part of the split. */
-    uint_t group_size = 0;
+    uint_t m_group_size = 0;
     /** The number of pieces in the split. */
-    uint_t split_size = 0;
+    uint_t m_split_size = 0;
     /**
     * The potential hardware resource that we are splitting at. QV_HW_OBJ_LAST
     * indicates that we are called from a split() context. Any other hardware
     * resource type indicates that we are splitting at that type: called from a
     * split_at() context.
     */
-    qv_hw_obj_type_t split_at_type;
+    qv_hw_obj_type_t m_split_at_type;
     /**
     * Vector of task IDs, one for each member of the group. Note that the
    * number of task IDs will always match the group size and that their array
     * index corresponds to a task ID. It is handy to have the task IDs for
     * splitting so we can query task characteristics during a split.
     */
-    std::vector<pid_t> taskids;
+    std::vector<pid_t> m_taskids;
     /**
     * Vector of hardware pools, one for each member of the group. Note that the
     * number of hardware pools will always match the group size and that their
     * array index corresponds to a task ID: 0 ... group_size - 1.
     */
-    std::vector<qvi_hwpool_s *> hwpools;
+    std::vector<qvi_hwpool_s *> m_hwpools;
     /**
     * Vector of colors, one for each member of the group. Note that the number
     * of colors will always match the group size and that their array index
     * corresponds to a task ID.
     */
-    std::vector<int> colors;
+    std::vector<int> m_colors;
     /** Vector of task affinities. */
-    qvi_hwloc_cpusets_t affinities;
+    qvi_hwloc_cpusets_t m_affinities;
+public:
     /** Constructor. */
     qvi_hwsplit_s(void) = default;
     /** Constructor. */
     qvi_hwsplit_s(
         qv_scope_t *parent,
-        uint_t group_size_a,
-        uint_t split_size_a,
-        qv_hw_obj_type_t split_at_type_a
+        uint_t group_size,
+        uint_t split_size,
+        qv_hw_obj_type_t split_at_type
     );
     /** Destructor. */
     ~qvi_hwsplit_s(void);
@@ -91,8 +93,8 @@ struct qvi_hwsplit_s {
     qvi_hwloc_bitmap_s
     cpuset(void) const;
     /**
-     * Performs a straightforward splitting of the provided cpuset: split the
-     * provided base cpuset into splitagg.split_size distinct pieces.
+     * Performs a straightforward splitting of the provided cpuset:
+     * split the provided base cpuset into split_size distinct pieces.
     */
     int
     split_cpuset(
@@ -146,6 +148,102 @@ struct qvi_hwsplit_s {
     split(void);
 };
 
+/**
+ * Collective hardware split: a collection of data and operations relevant to
+ * split operations requiring aggregated resource knowledge AND coordination
+ * between tasks in the parent scope to perform a split.
+ */
+struct qvi_scope_split_coll_s {
+    /**
+     * The root task ID used for collective operations.
+     * We use 0 as the root because 0 will always exist.
+     */
+    static constexpr int s_rootid = 0;
+    /** Points to the parent scope that we are splitting. */
+    qv_scope_t *parent = nullptr;
+    /** My color. */
+    int mycolor = 0;
+    /**
+     * Stores group-global hardware split information brought together by
+     * collective operations across the members in the parent scope.
+     */
+    qvi_hwsplit_s hwsplit;
+    /** Constructor. */
+    qvi_scope_split_coll_s(void) = delete;
+    /** Constructor. */
+    qvi_scope_split_coll_s(
+        qv_scope_t *parent_a,
+        uint_t split_size_a,
+        int mycolor_a,
+        qv_hw_obj_type_t split_at_type_a
+    );
+    /** Scatters values from the root to all group members. */
+    template <typename TYPE>
+    int
+    scatter_values(
+        int root,
+        const std::vector<TYPE> &values,
+        TYPE *value
+    );
+    /** Broadcasts a value from the root to all group members. */
+    template <typename TYPE>
+    int
+    bcast_value(
+        int root,
+        TYPE *value
+    );
+    /** Gathers values from all group members to the root. */
+    template <typename TYPE>
+    int
+    gather_values(
+        int root,
+        TYPE invalue,
+        std::vector<TYPE> &outvals
+    );
+    /** Gathers hardware pools from all group members to the root. */
+    int
+    gather_hwpools(
+        int root,
+        qvi_hwpool_s *txpool,
+        std::vector<qvi_hwpool_s *> &rxpools
+    );
+    /** Gathers the group's split information to the root. */
+    int
+    gather(void);
+    /** Scatters hardware pools from the root to all group members. */
+    int
+    scatter_hwpools(
+        int root,
+        const std::vector<qvi_hwpool_s *> &pools,
+        qvi_hwpool_s **pool
+    );
+    /** Scatters the split results from the root to all group members. */
+    int
+    scatter(
+        int *colorp,
+        qvi_hwpool_s **result
+    );
+    /** Performs a barrier across the parent scope's group. */
+    int
+    barrier(void);
+    /**
+     * Split the hardware resources based on the provided split parameters:
+     * - npieces: The number of splits requested.
+     * - color: Either user-supplied (explicitly set) or a value that requests
+     *   us to do the coloring for the callers.
+     * - maybe_obj_type: Potentially the object type that we are splitting at.
+     *   This value influences how the splitting algorithms perform their
+     *   mapping.
+     * - colorp: Potentially a new color assignment determined by one
+     *   of our coloring algorithms. This value can be used to influence the
+     *   group splitting that occurs after this call completes.
+     */
+    int
+    split(
+        int *colorp,
+        qvi_hwpool_s **result
+    );
+};
+
 #endif /*
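
For reference, here is a minimal caller-side sketch (not part of the patch) of the collective split flow that qvi_scope_split_coll_s now encapsulates. It mirrors the new qvi_scope_split() body above; the function name split_example and the bare-bones error handling are illustrative assumptions, not repository code.

// Illustrative only: a hypothetical caller showing the gather -> root-side
// split -> scatter sequence that qvi_scope_split_coll_s::split() drives.
#include "qvi-scope.h"
#include "qvi-split.h"
#include "qvi-hwpool.h"

static int
split_example(
    qv_scope_t *parent,
    int npieces,
    int color
) {
    int colorp = 0;                 // Color assignment computed by the split.
    qvi_hwpool_s *hwpool = nullptr; // This task's piece of the hardware pool.
    // Every task in the parent scope's group constructs the collective state;
    // only the root (rank 0) materializes the aggregate qvi_hwsplit_s.
    // QV_HW_OBJ_LAST signals a plain split() rather than a split_at().
    qvi_scope_split_coll_s splitcoll(
        parent, npieces, color, QV_HW_OBJ_LAST
    );
    // Collective call: gathers task IDs, hardware pools, colors, and
    // affinities to the root; the root computes the split; the results are
    // scattered back to every member.
    const int rc = splitcoll.split(&colorp, &hwpool);
    if (rc != QV_SUCCESS) return rc;
    // colorp and hwpool then drive the underlying group split and child
    // scope creation, as qvi_scope_split() does above.
    return rc;
}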