From d26c05ae640190f46e309cd916eeb72f5398a263 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 24 Oct 2025 14:28:35 +0200 Subject: [PATCH 1/2] Refactor pack/unpack kernels --- include/ghex/unstructured/user_concepts.hpp | 157 +++++++++++++++----- 1 file changed, 121 insertions(+), 36 deletions(-) diff --git a/include/ghex/unstructured/user_concepts.hpp b/include/ghex/unstructured/user_concepts.hpp index 280872a2..66becac5 100644 --- a/include/ghex/unstructured/user_concepts.hpp +++ b/include/ghex/unstructured/user_concepts.hpp @@ -454,39 +454,71 @@ class data_descriptor #ifdef GHEX_CUDACC -#define GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK 32 +#define GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X 32 +#define GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y 8 template __global__ void -pack_kernel(const T* values, const std::size_t local_indices_size, +pack_kernel_levels_first(const T* values, const std::size_t local_indices_size, const std::size_t* local_indices, const std::size_t levels, T* buffer, - const std::size_t index_stride, const std::size_t level_stride, - const std::size_t buffer_index_stride, const std::size_t buffer_level_stride) + const std::size_t index_stride, const std::size_t buffer_index_stride) +{ + const std::size_t level = threadIdx.x + (blockIdx.x * blockDim.x); + const std::size_t idx = threadIdx.y + (blockIdx.y * blockDim.y); + + if (idx < local_indices_size && level < levels) + { + auto const local_index = local_indices[idx]; + buffer[idx * buffer_index_stride + level] = values[local_index * index_stride + level]; + } +} + +template +__global__ void +pack_kernel_levels_last(const T* values, const std::size_t local_indices_size, + const std::size_t* local_indices, const std::size_t levels, T* buffer, + const std::size_t level_stride, const std::size_t buffer_level_stride) { const std::size_t idx = threadIdx.x + (blockIdx.x * blockDim.x); - if (idx < local_indices_size) + const std::size_t level = threadIdx.y + 
(blockIdx.y * blockDim.y); + + if (idx < local_indices_size && level < levels) { - for (std::size_t level = 0; level < levels; ++level) - { - buffer[idx * buffer_index_stride + level * buffer_level_stride] = values[local_indices[idx] * index_stride + level * level_stride]; - } + auto const local_index = local_indices[idx]; + buffer[idx + level * buffer_level_stride] = values[local_index + level * level_stride]; } } template __global__ void -unpack_kernel(const T* buffer, const std::size_t local_indices_size, +unpack_kernel_levels_first(const T* buffer, const std::size_t local_indices_size, const std::size_t* local_indices, const std::size_t levels, T* values, - const std::size_t index_stride, const std::size_t level_stride, - const std::size_t buffer_index_stride, const std::size_t buffer_level_stride) + + const std::size_t index_stride, const std::size_t buffer_index_stride) +{ + const std::size_t level = threadIdx.x + (blockIdx.x * blockDim.x); + const std::size_t idx = threadIdx.y + (blockIdx.y * blockDim.y); + + if (idx < local_indices_size && level < levels) + { + auto const local_index = local_indices[idx]; + values[local_index * index_stride + level] = buffer[idx * buffer_index_stride + level]; + } +} + +template +__global__ void +unpack_kernel_levels_last(const T* buffer, const std::size_t local_indices_size, + const std::size_t* local_indices, const std::size_t levels, T* values, + const std::size_t level_stride, const std::size_t buffer_level_stride) { const std::size_t idx = threadIdx.x + (blockIdx.x * blockDim.x); - if (idx < local_indices_size) + const std::size_t level = threadIdx.y + (blockIdx.y * blockDim.y); + + if (idx < local_indices_size && level < levels) { - for (std::size_t level = 0; level < levels; ++level) - { - values[local_indices[idx] * index_stride + level * level_stride] = buffer[idx * buffer_index_stride + level * buffer_level_stride]; - } + auto const local_index = local_indices[idx]; + values[local_index + level * level_stride] = 
buffer[idx + level * buffer_level_stride]; } } @@ -522,7 +554,8 @@ class data_descriptor * @param outer_stride outer dimension's stride measured in number of elements of type T (special value 0: no padding) * @param device_id device id*/ data_descriptor(const domain_descriptor_type& domain, value_type* field, - std::size_t levels = 1u, bool levels_first = true, std::size_t outer_stride = 0u, device_id_type device_id = arch_traits::current_id()) + std::size_t levels = 1u, bool levels_first = true, std::size_t outer_stride = 0u, + device_id_type device_id = arch_traits::current_id()) : m_device_id{device_id} , m_domain_id{domain.domain_id()} , m_domain_size{domain.size()} @@ -549,34 +582,86 @@ class data_descriptor template void pack(value_type* buffer, const IndexContainer& c, void* stream_ptr) { + const dim3 threads_per_block(GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X, + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y); + for (const auto& is : c) { - const int n_blocks = - static_cast(std::ceil(static_cast(is.local_indices().size()) / - GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK)); - const std::size_t buffer_index_stride = m_levels_first ? m_levels : 1u; - const std::size_t buffer_level_stride = m_levels_first ? 
1u : is.local_indices().size(); - pack_kernel<<(stream_ptr))>>>(m_values, - is.local_indices().size(), is.local_indices().data(), m_levels, buffer, - m_index_stride, m_level_stride, buffer_index_stride, buffer_level_stride); + if (m_levels_first) + { + const int blocks_levels = static_cast( + std::ceil(static_cast(m_levels) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X)); + const int blocks_indices = static_cast( + std::ceil(static_cast(is.local_indices().size()) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y)); + + const dim3 blocks(blocks_levels, blocks_indices); + + pack_kernel_levels_first<<(stream_ptr))>>>(m_values, + is.local_indices().size(), is.local_indices().data(), m_levels, buffer, + m_index_stride, m_levels); + } + else + { + const int blocks_indices = static_cast( + std::ceil(static_cast(is.local_indices().size()) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X)); + const int blocks_levels = static_cast( + std::ceil(static_cast(m_levels) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y)); + + const dim3 blocks(blocks_indices, blocks_levels); + + pack_kernel_levels_last<<(stream_ptr))>>>(m_values, + is.local_indices().size(), is.local_indices().data(), m_levels, buffer, + m_level_stride, is.local_indices().size()); + } } } template void unpack(const value_type* buffer, const IndexContainer& c, void* stream_ptr) { + const dim3 threads_per_block(GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X, + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y); + for (const auto& is : c) { - const int n_blocks = - static_cast(std::ceil(static_cast(is.local_indices().size()) / - GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK)); - const std::size_t buffer_index_stride = m_levels_first ? m_levels : 1u; - const std::size_t buffer_level_stride = m_levels_first ? 
1u : is.local_indices().size(); - unpack_kernel<<(stream_ptr))>>>(buffer, - is.local_indices().size(), is.local_indices().data(), m_levels, m_values, - m_index_stride, m_level_stride, buffer_index_stride, buffer_level_stride); + if (m_levels_first) + { + const int blocks_levels = static_cast( + std::ceil(static_cast(m_levels) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X)); + const int blocks_indices = static_cast( + std::ceil(static_cast(is.local_indices().size()) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y)); + + const dim3 blocks(blocks_levels, blocks_indices); + + unpack_kernel_levels_first<<(stream_ptr))>>>(buffer, + is.local_indices().size(), is.local_indices().data(), m_levels, m_values, + m_index_stride, m_levels); + } + else + { + const int blocks_indices = static_cast( + std::ceil(static_cast(is.local_indices().size()) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_X)); + const int blocks_levels = static_cast( + std::ceil(static_cast(m_levels) / + GHEX_UNSTRUCTURED_SERIALIZATION_THREADS_PER_BLOCK_Y)); + + const dim3 blocks(blocks_indices, blocks_levels); + + unpack_kernel_levels_last<<(stream_ptr))>>>(buffer, + is.local_indices().size(), is.local_indices().data(), m_levels, m_values, + m_level_stride, is.local_indices().size()); + } } } }; From c302542d7072bc42f0479166516c0717a57b2bc7 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 22 Dec 2025 13:06:20 +0100 Subject: [PATCH 2/2] Format files --- .../atlas_halo_exchange_nodecolumns.cpp | 227 +- ...m_2_test_halo_exchange_3D_generic_full.cpp | 3056 ++++++----------- ...gcl_test_halo_exchange_3D_generic_full.cpp | 2889 +++++----------- ...omm_test_halo_exchange_3D_generic_full.cpp | 2738 +++++---------- benchmarks/simple_rma.cpp | 281 +- .../transport/ghex_p2p_bi_cb_avail_mt.cpp | 293 +- .../transport/ghex_p2p_bi_cb_wait_mt.cpp | 155 +- .../transport/ghex_p2p_bi_ft_avail_mt.cpp | 226 +- .../transport/ghex_p2p_bi_ft_wait_mt.cpp | 111 +- 
benchmarks/transport/ghex_p2p_cb_dynamic.cpp | 172 +- .../ghex_p2p_cb_dynamic_resubmit.cpp | 168 +- .../ghex_p2p_cb_dynamic_resubmit_mt.cpp | 221 +- benchmarks/transport/ghex_p2p_cb_resubmit.cpp | 169 +- benchmarks/transport/mpi_p2p_avail_any.cpp | 155 +- benchmarks/transport/mpi_p2p_avail_mt.cpp | 145 +- benchmarks/transport/mpi_p2p_bi_avail.cpp | 124 +- benchmarks/transport/mpi_p2p_bi_avail_mt.cpp | 149 +- benchmarks/transport/mpi_p2p_bi_wait_mt.cpp | 193 +- benchmarks/transport/pool_allocator.hpp | 165 +- benchmarks/transport/utils.hpp | 15 +- benchmarks/unstructured_parmetis.cpp | 567 +-- bindings/fhex/cubed_sphere_bind.cpp | 287 +- bindings/python/src/_pyghex/config.cpp | 6 +- bindings/python/src/_pyghex/context_shim.cpp | 3 +- bindings/python/src/_pyghex/mpi_comm_shim.cpp | 1 - .../src/_pyghex/py_dtype_to_cpp_name.cpp | 25 +- .../python/src/_pyghex/register_class.hpp | 8 +- .../regular/communication_object.cpp | 25 +- .../regular/communication_object.hpp | 33 +- .../structured/regular/field_descriptor.cpp | 85 +- .../structured/regular/halo_generator.cpp | 4 +- .../_pyghex/structured/regular/pattern.cpp | 6 +- .../unstructured/communication_object.cpp | 27 +- .../unstructured/communication_object.hpp | 1 - .../unstructured/domain_descriptor.cpp | 8 +- .../_pyghex/unstructured/field_descriptor.cpp | 48 +- .../_pyghex/unstructured/field_descriptor.hpp | 5 +- .../_pyghex/unstructured/halo_generator.cpp | 8 +- .../_pyghex/unstructured/halo_generator.hpp | 1 - .../src/_pyghex/unstructured/pattern.cpp | 15 +- .../src/_pyghex/unstructured/pattern.hpp | 2 - .../python/src/_pyghex/unstructured/types.hpp | 2 +- bindings/python/src/_pyghex/util/demangle.hpp | 17 +- include/ghex/bulk_communication_object.hpp | 157 +- include/ghex/communication_object.hpp | 191 +- include/ghex/context.hpp | 1 + include/ghex/device/cuda/error.hpp | 8 +- include/ghex/device/cuda/future.hpp | 14 +- include/ghex/device/cuda/runtime.hpp | 94 +- include/ghex/device/cuda/stream.hpp | 7 +- 
include/ghex/glue/gridtools/field.hpp | 20 +- .../ghex/glue/gridtools/make_gt_pattern.hpp | 4 +- .../ghex/glue/gridtools/processor_grid.hpp | 8 +- include/ghex/packer.hpp | 15 +- include/ghex/pattern_container.hpp | 12 +- include/ghex/rma/cuda/handle.hpp | 4 +- include/ghex/rma/event.hpp | 16 +- include/ghex/rma/handle.hpp | 4 +- include/ghex/rma/range_factory.hpp | 16 +- include/ghex/rma/shmem/access_guard.hpp | 6 +- include/ghex/rma/thread/access_guard.hpp | 4 +- include/ghex/rma/xpmem/handle.hpp | 4 +- .../cubed_sphere/field_descriptor.hpp | 8 +- .../cubed_sphere/halo_generator.hpp | 9 +- .../structured/cubed_sphere/transform.hpp | 24 +- include/ghex/structured/field_descriptor.hpp | 12 +- include/ghex/structured/field_utils.hpp | 19 +- include/ghex/structured/pack_kernels.hpp | 18 +- include/ghex/structured/pattern.hpp | 33 +- .../structured/regular/field_descriptor.hpp | 5 +- .../structured/regular/halo_generator.hpp | 12 +- .../ghex/structured/regular/make_pattern.hpp | 4 +- include/ghex/structured/rma_put.hpp | 25 +- .../ghex/structured/rma_range_generator.hpp | 8 +- include/ghex/unstructured/user_concepts.hpp | 12 +- include/ghex/util/coordinate.hpp | 4 +- include/ghex/util/decomposition.hpp | 2 +- include/ghex/util/resource_layout.hpp | 16 +- scripts/container_clang_format.sh | 9 +- test/mpi_runner/gtest_main_mpi.cpp | 2 +- .../test_cubed_sphere_exchange.cpp | 1088 +++--- test/structured/regular/test_local_rma.cpp | 71 +- .../regular/test_regular_domain.cpp | 108 +- .../regular/test_simple_regular_domain.cpp | 30 +- test/unstructured/test_user_concepts.cpp | 4 +- test/unstructured/unstructured_test_case.hpp | 20 +- test/util/memory.hpp | 5 +- 87 files changed, 6172 insertions(+), 8797 deletions(-) diff --git a/benchmarks/atlas_halo_exchange_nodecolumns.cpp b/benchmarks/atlas_halo_exchange_nodecolumns.cpp index 823d7212..477cdddc 100644 --- a/benchmarks/atlas_halo_exchange_nodecolumns.cpp +++ b/benchmarks/atlas_halo_exchange_nodecolumns.cpp @@ -54,9 +54,8 
@@ using transport = gridtools::ghex::tl::ucx_tag; #endif using context_type = gridtools::ghex::tl::context; - -TEST(atlas_integration, halo_exchange_nodecolumns) { - +TEST(atlas_integration, halo_exchange_nodecolumns) +{ using timer_type = gridtools::ghex::timer; using domain_id_t = int; using domain_descriptor_t = gridtools::ghex::atlas_domain_descriptor; @@ -67,45 +66,44 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { using storage_traits_cpu = gridtools::storage::cpu_ifirst; #endif using function_space_t = atlas::functionspace::NodeColumns; - using cpu_data_descriptor_t = gridtools::ghex::atlas_data_descriptor; + using cpu_data_descriptor_t = gridtools::ghex::atlas_data_descriptor; const int n_iter = 50; - auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; - int rank = context.rank(); + int rank = context.rank(); // Output file std::stringstream ss_file; ss_file << rank; - std::string filename = "atlas_halo_exchange_nodecolumns_times_" + ss_file.str() + ".txt"; + std::string filename = "atlas_halo_exchange_nodecolumns_times_" + ss_file.str() + ".txt"; std::ofstream file(filename.c_str()); file << "Atlas halo exchange nodecolumns - Timings\n"; // Timers timer_type t_atlas_cpu_local, t_atlas_cpu_global; // Atlas on CPU - timer_type t_ghex_cpu_local, t_ghex_cpu_global; // GHEX on CPU - timer_type t_ghex_gpu_local, t_ghex_gpu_global; // GHEX on GPU + timer_type t_ghex_cpu_local, t_ghex_cpu_global; // GHEX on CPU + timer_type t_ghex_gpu_local, t_ghex_gpu_global; // GHEX on GPU // Global octahedral Gaussian grid atlas::StructuredGrid grid("O1280"); // Generate mesh atlas::StructuredMeshGenerator meshgenerator; - atlas::Mesh mesh = meshgenerator.generate(grid); + atlas::Mesh mesh = meshgenerator.generate(grid); // Number of vertical levels std::size_t nb_levels = 100; // Generate functionspace associated to the mesh - 
atlas::functionspace::NodeColumns fs_nodes(mesh, atlas::option::levels(nb_levels) | atlas::option::halo(2)); + atlas::functionspace::NodeColumns fs_nodes(mesh, + atlas::option::levels(nb_levels) | atlas::option::halo(2)); // Instantiate domain descriptor std::vector local_domains{}; - domain_descriptor_t d{rank, - mesh.nodes().partition(), - mesh.nodes().remote_index(), - nb_levels}; + domain_descriptor_t d{rank, mesh.nodes().partition(), mesh.nodes().remote_index(), nb_levels}; local_domains.push_back(d); // Instantiate halo generator @@ -118,7 +116,8 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { auto patterns = gridtools::ghex::make_pattern(context, hg, rdig, local_domains); // Make communication object - auto co = gridtools::ghex::make_communication_object(context.get_communicator()); + auto co = + gridtools::ghex::make_communication_object(context.get_communicator()); // Fields creation and initialization ::atlas::FieldSet atlas_fields; @@ -126,10 +125,14 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { atlas_fields.add(fs_nodes.createField(atlas::option::name("atlas_field_2"))); atlas_fields.add(fs_nodes.createField(atlas::option::name("atlas_field_3"))); atlas_fields.add(fs_nodes.createField(atlas::option::name("atlas_field_4"))); - auto GHEX_field_1 = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field - auto GHEX_field_2 = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field - auto GHEX_field_3 = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field - auto GHEX_field_4 = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field + auto GHEX_field_1 = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto GHEX_field_2 = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto GHEX_field_3 = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto 
GHEX_field_4 = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field { auto atlas_field_1_data = atlas::array::make_view(atlas_fields["atlas_field_1"]); auto atlas_field_2_data = atlas::array::make_view(atlas_fields["atlas_field_2"]); @@ -139,17 +142,23 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { auto GHEX_field_2_data = GHEX_field_2.host_view(); auto GHEX_field_3_data = GHEX_field_3.host_view(); auto GHEX_field_4_data = GHEX_field_4.host_view(); - for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) { - for (auto level = 0; level < fs_nodes.levels(); ++level) { + for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) + { + for (auto level = 0; level < fs_nodes.levels(); ++level) + { auto value = (rank << 15) + (node << 7) + level; atlas_field_1_data(node, level) = value; atlas_field_2_data(node, level) = value; atlas_field_3_data(node, level) = value; atlas_field_4_data(node, level) = value; - GHEX_field_1_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_2_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_3_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_4_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_1_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_2_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_3_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_4_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. 
Should be more flexible } } } @@ -161,17 +170,22 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { auto GHEX_field_4_target_data = GHEX_field_4.target_view(); // Instantiate data descriptor - cpu_data_descriptor_t data_1{local_domains.front(), GHEX_field_1_target_data, GHEX_field_1.components()}; - cpu_data_descriptor_t data_2{local_domains.front(), GHEX_field_2_target_data, GHEX_field_2.components()}; - cpu_data_descriptor_t data_3{local_domains.front(), GHEX_field_3_target_data, GHEX_field_3.components()}; - cpu_data_descriptor_t data_4{local_domains.front(), GHEX_field_4_target_data, GHEX_field_4.components()}; + cpu_data_descriptor_t data_1{local_domains.front(), GHEX_field_1_target_data, + GHEX_field_1.components()}; + cpu_data_descriptor_t data_2{local_domains.front(), GHEX_field_2_target_data, + GHEX_field_2.components()}; + cpu_data_descriptor_t data_3{local_domains.front(), GHEX_field_3_target_data, + GHEX_field_3.components()}; + cpu_data_descriptor_t data_4{local_domains.front(), GHEX_field_4_target_data, + GHEX_field_4.components()}; // Atlas halo exchange // Atlas built-in halo exchange function is called (only from the CPU) for testing data correctness. // Time comparison might give a hint that GHEX exchange times are consistent, // but Atlas times should not be considered as a baseline. 
fs_nodes.haloExchange(atlas_fields); // first iteration - for (auto i = 0; i < n_iter; ++i) { // benchmark + for (auto i = 0; i < n_iter; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); @@ -184,13 +198,16 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { } // GHEX halo exchange - auto h = co.exchange(patterns(data_1), patterns(data_2), patterns(data_3), patterns(data_4)); // first iteration + auto h = co.exchange(patterns(data_1), patterns(data_2), patterns(data_3), + patterns(data_4)); // first iteration h.wait(); - for (auto i = 0; i < n_iter; ++i) { // benchmark + for (auto i = 0; i < n_iter; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); - auto h = co.exchange(patterns(data_1), patterns(data_2), patterns(data_3), patterns(data_4)); + auto h = + co.exchange(patterns(data_1), patterns(data_2), patterns(data_3), patterns(data_4)); h.wait(); t_local.toc(); t_ghex_cpu_local(t_local); @@ -201,57 +218,87 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { // test for correctness { - auto atlas_field_1_data = atlas::array::make_view(atlas_fields["atlas_field_1"]); - auto atlas_field_2_data = atlas::array::make_view(atlas_fields["atlas_field_2"]); - auto atlas_field_3_data = atlas::array::make_view(atlas_fields["atlas_field_3"]); - auto atlas_field_4_data = atlas::array::make_view(atlas_fields["atlas_field_4"]); + auto atlas_field_1_data = + atlas::array::make_view(atlas_fields["atlas_field_1"]); + auto atlas_field_2_data = + atlas::array::make_view(atlas_fields["atlas_field_2"]); + auto atlas_field_3_data = + atlas::array::make_view(atlas_fields["atlas_field_3"]); + auto atlas_field_4_data = + atlas::array::make_view(atlas_fields["atlas_field_4"]); auto GHEX_field_1_data = GHEX_field_1.const_host_view(); auto GHEX_field_2_data = GHEX_field_2.const_host_view(); auto GHEX_field_3_data = GHEX_field_3.const_host_view(); auto GHEX_field_4_data = GHEX_field_4.const_host_view(); - 
for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) { - for (auto level = 0; level < fs_nodes.levels(); ++level) { - EXPECT_TRUE(GHEX_field_1_data(node, level, 0) == atlas_field_1_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_2_data(node, level, 0) == atlas_field_2_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_3_data(node, level, 0) == atlas_field_3_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_4_data(node, level, 0) == atlas_field_4_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible + for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) + { + for (auto level = 0; level < fs_nodes.levels(); ++level) + { + EXPECT_TRUE(GHEX_field_1_data(node, level, 0) == + atlas_field_1_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible + EXPECT_TRUE(GHEX_field_2_data(node, level, 0) == + atlas_field_2_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible + EXPECT_TRUE(GHEX_field_3_data(node, level, 0) == + atlas_field_3_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible + EXPECT_TRUE(GHEX_field_4_data(node, level, 0) == + atlas_field_4_data(node, + level)); // TO DO: hard-coded 3d view. 
Should be more flexible } } } // Write timings file << "- Atlas CPU benchmark\n" - << "\tlocal time = " << t_atlas_cpu_local.mean() / 1000.0 << "+/-" << t_atlas_cpu_local.stddev() / (sqrt(t_atlas_cpu_local.num_samples()) * 1000.0) << "s\n" - << "\tglobal time = " << t_atlas_cpu_global.mean() / 1000.0 << "+/-" << t_atlas_cpu_global.stddev() / (sqrt(t_atlas_cpu_global.num_samples()) * 1000.0) << "s\n"; + << "\tlocal time = " << t_atlas_cpu_local.mean() / 1000.0 << "+/-" + << t_atlas_cpu_local.stddev() / (sqrt(t_atlas_cpu_local.num_samples()) * 1000.0) << "s\n" + << "\tglobal time = " << t_atlas_cpu_global.mean() / 1000.0 << "+/-" + << t_atlas_cpu_global.stddev() / (sqrt(t_atlas_cpu_global.num_samples()) * 1000.0) + << "s\n"; file << "- GHEX CPU benchmark\n" - << "\tlocal time = " << t_ghex_cpu_local.mean() / 1000.0 << "+/-" << t_ghex_cpu_local.stddev() / (sqrt(t_ghex_cpu_local.num_samples()) * 1000.0) << "s\n" - << "\tglobal time = " << t_ghex_cpu_global.mean() / 1000.0 << "+/-" << t_ghex_cpu_global.stddev() / (sqrt(t_ghex_cpu_global.num_samples()) * 1000.0) << "s\n"; + << "\tlocal time = " << t_ghex_cpu_local.mean() / 1000.0 << "+/-" + << t_ghex_cpu_local.stddev() / (sqrt(t_ghex_cpu_local.num_samples()) * 1000.0) << "s\n" + << "\tglobal time = " << t_ghex_cpu_global.mean() / 1000.0 << "+/-" + << t_ghex_cpu_global.stddev() / (sqrt(t_ghex_cpu_global.num_samples()) * 1000.0) << "s\n"; #ifdef GHEX_CUDACC using storage_traits_gpu = gridtools::storage::gpu; // Additional data descriptor type for GPU - using gpu_data_descriptor_t = gridtools::ghex::atlas_data_descriptor; + using gpu_data_descriptor_t = gridtools::ghex::atlas_data_descriptor; // Additional fields for GPU halo exchange - auto GHEX_field_1_gpu = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field - auto GHEX_field_2_gpu = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field - auto GHEX_field_3_gpu = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 
component / scalar field - auto GHEX_field_4_gpu = gridtools::ghex::atlas::make_field(fs_nodes, 1); // 1 component / scalar field + auto GHEX_field_1_gpu = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto GHEX_field_2_gpu = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto GHEX_field_3_gpu = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field + auto GHEX_field_4_gpu = gridtools::ghex::atlas::make_field(fs_nodes, + 1); // 1 component / scalar field { auto GHEX_field_1_gpu_data = GHEX_field_1_gpu.host_view(); auto GHEX_field_2_gpu_data = GHEX_field_2_gpu.host_view(); auto GHEX_field_3_gpu_data = GHEX_field_3_gpu.host_view(); auto GHEX_field_4_gpu_data = GHEX_field_4_gpu.host_view(); - for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) { - for (auto level = 0; level < fs_nodes.levels(); ++level) { + for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) + { + for (auto level = 0; level < fs_nodes.levels(); ++level) + { auto value = (rank << 15) + (node << 7) + level; - GHEX_field_1_gpu_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_2_gpu_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_3_gpu_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible - GHEX_field_4_gpu_data(node, level, 0) = value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_1_gpu_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_2_gpu_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_3_gpu_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. Should be more flexible + GHEX_field_4_gpu_data(node, level, 0) = + value; // TO DO: hard-coded 3d view. 
Should be more flexible } } } @@ -263,19 +310,26 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { auto GHEX_field_4_gpu_target_data = GHEX_field_4_gpu.target_view(); // Additional data descriptor for GPU halo exchange - gpu_data_descriptor_t data_1_gpu{local_domains.front(), 0, GHEX_field_1_gpu_target_data, GHEX_field_1_gpu.components()}; - gpu_data_descriptor_t data_2_gpu{local_domains.front(), 0, GHEX_field_2_gpu_target_data, GHEX_field_2_gpu.components()}; - gpu_data_descriptor_t data_3_gpu{local_domains.front(), 0, GHEX_field_3_gpu_target_data, GHEX_field_3_gpu.components()}; - gpu_data_descriptor_t data_4_gpu{local_domains.front(), 0, GHEX_field_4_gpu_target_data, GHEX_field_4_gpu.components()}; + gpu_data_descriptor_t data_1_gpu{local_domains.front(), 0, GHEX_field_1_gpu_target_data, + GHEX_field_1_gpu.components()}; + gpu_data_descriptor_t data_2_gpu{local_domains.front(), 0, GHEX_field_2_gpu_target_data, + GHEX_field_2_gpu.components()}; + gpu_data_descriptor_t data_3_gpu{local_domains.front(), 0, GHEX_field_3_gpu_target_data, + GHEX_field_3_gpu.components()}; + gpu_data_descriptor_t data_4_gpu{local_domains.front(), 0, GHEX_field_4_gpu_target_data, + GHEX_field_4_gpu.components()}; // GHEX halo exchange on GPU - auto h_gpu = co.exchange(patterns(data_1_gpu), patterns(data_2_gpu), patterns(data_3_gpu), patterns(data_4_gpu)); // first iteration + auto h_gpu = co.exchange(patterns(data_1_gpu), patterns(data_2_gpu), patterns(data_3_gpu), + patterns(data_4_gpu)); // first iteration h_gpu.wait(); - for (auto i = 0; i < n_iter; ++i) { // benchmark + for (auto i = 0; i < n_iter; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); - auto h_gpu = co.exchange(patterns(data_1_gpu), patterns(data_2_gpu), patterns(data_3_gpu), patterns(data_4_gpu)); + auto h_gpu = co.exchange(patterns(data_1_gpu), patterns(data_2_gpu), patterns(data_3_gpu), + patterns(data_4_gpu)); h_gpu.wait(); t_local.toc(); t_ghex_gpu_local(t_local); @@ 
-286,29 +340,44 @@ TEST(atlas_integration, halo_exchange_nodecolumns) { // Test for correctness { - auto atlas_field_1_data = atlas::array::make_view(atlas_fields["atlas_field_1"]); - auto atlas_field_2_data = atlas::array::make_view(atlas_fields["atlas_field_2"]); - auto atlas_field_3_data = atlas::array::make_view(atlas_fields["atlas_field_3"]); - auto atlas_field_4_data = atlas::array::make_view(atlas_fields["atlas_field_4"]); + auto atlas_field_1_data = + atlas::array::make_view(atlas_fields["atlas_field_1"]); + auto atlas_field_2_data = + atlas::array::make_view(atlas_fields["atlas_field_2"]); + auto atlas_field_3_data = + atlas::array::make_view(atlas_fields["atlas_field_3"]); + auto atlas_field_4_data = + atlas::array::make_view(atlas_fields["atlas_field_4"]); auto GHEX_field_1_gpu_data = GHEX_field_1_gpu.const_host_view(); auto GHEX_field_2_gpu_data = GHEX_field_2_gpu.const_host_view(); auto GHEX_field_3_gpu_data = GHEX_field_3_gpu.const_host_view(); auto GHEX_field_4_gpu_data = GHEX_field_4_gpu.const_host_view(); - for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) { - for (auto level = 0; level < fs_nodes.levels(); ++level) { - EXPECT_TRUE(GHEX_field_1_gpu_data(node, level, 0) == atlas_field_1_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_2_gpu_data(node, level, 0) == atlas_field_2_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_3_gpu_data(node, level, 0) == atlas_field_3_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible - EXPECT_TRUE(GHEX_field_4_gpu_data(node, level, 0) == atlas_field_4_data(node, level)); // TO DO: hard-coded 3d view. Should be more flexible + for (auto node = 0; node < fs_nodes.nb_nodes(); ++node) + { + for (auto level = 0; level < fs_nodes.levels(); ++level) + { + EXPECT_TRUE(GHEX_field_1_gpu_data(node, level, 0) == + atlas_field_1_data(node, + level)); // TO DO: hard-coded 3d view. 
Should be more flexible + EXPECT_TRUE(GHEX_field_2_gpu_data(node, level, 0) == + atlas_field_2_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible + EXPECT_TRUE(GHEX_field_3_gpu_data(node, level, 0) == + atlas_field_3_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible + EXPECT_TRUE(GHEX_field_4_gpu_data(node, level, 0) == + atlas_field_4_data(node, + level)); // TO DO: hard-coded 3d view. Should be more flexible } } } // Write timings file << "- GHEX GPU benchmark\n" - << "\tlocal time = " << t_ghex_gpu_local.mean() / 1000.0 << "+/-" << t_ghex_gpu_local.stddev() / (sqrt(t_ghex_gpu_local.num_samples()) * 1000.0) << "s\n" - << "\tglobal time = " << t_ghex_gpu_global.mean() / 1000.0 << "+/-" << t_ghex_gpu_global.stddev() / (sqrt(t_ghex_gpu_global.num_samples()) * 1000.0) << "s\n"; + << "\tlocal time = " << t_ghex_gpu_local.mean() / 1000.0 << "+/-" + << t_ghex_gpu_local.stddev() / (sqrt(t_ghex_gpu_local.num_samples()) * 1000.0) << "s\n" + << "\tglobal time = " << t_ghex_gpu_global.mean() / 1000.0 << "+/-" + << t_ghex_gpu_global.stddev() / (sqrt(t_ghex_gpu_global.num_samples()) * 1000.0) << "s\n"; #endif - } diff --git a/benchmarks/comm_2_test_halo_exchange_3D_generic_full.cpp b/benchmarks/comm_2_test_halo_exchange_3D_generic_full.cpp index 3d6af288..a284a27d 100644 --- a/benchmarks/comm_2_test_halo_exchange_3D_generic_full.cpp +++ b/benchmarks/comm_2_test_halo_exchange_3D_generic_full.cpp @@ -38,2138 +38,1107 @@ using transport = gridtools::ghex::tl::mpi_tag; using context_type = typename gridtools::ghex::tl::context_factory::context_type; -namespace halo_exchange_3D_generic_full { - - using timer_type = gridtools::ghex::timer; +namespace halo_exchange_3D_generic_full +{ +using timer_type = gridtools::ghex::timer; - MPI_Comm CartComm; - int dims[3] = {0, 0, 0}; - int coords[3] = {0, 0, 0}; +MPI_Comm CartComm; +int dims[3] = {0, 0, 0}; +int coords[3] = {0, 0, 0}; #define B_ADD 1 #define C_ADD 2 #ifdef VECTOR_INTERFACE - 
typedef int T1; - typedef int T2; - typedef int T3; +typedef int T1; +typedef int T2; +typedef int T3; #else - typedef int T1; - typedef double T2; - typedef long long int T3; +typedef int T1; +typedef double T2; +typedef long long int T3; #endif - using domain_descriptor_type = gridtools::ghex::structured::regular::domain_descriptor>; - using halo_generator_type = gridtools::ghex::structured::regular::halo_generator>; - template - using field_descriptor_type = gridtools::ghex::structured::regular::field_descriptor>; +using domain_descriptor_type = + gridtools::ghex::structured::regular::domain_descriptor>; +using halo_generator_type = + gridtools::ghex::structured::regular::halo_generator>; +template +using field_descriptor_type = gridtools::ghex::structured::regular::field_descriptor>; #ifdef GHEX_CUDACC - using arch_type = gridtools::ghex::gpu; +using arch_type = gridtools::ghex::gpu; #else - using arch_type = gridtools::ghex::cpu; +using arch_type = gridtools::ghex::cpu; #endif - template - void printbuff(std::ostream& file, const gridtools::ghex::structured::regular::field_descriptor>& field) +template +void +printbuff(std::ostream& file, const gridtools::ghex::structured::regular::field_descriptor>& field) +{ + if (field.extents()[0] <= 10 && field.extents()[1] <= 10 && field.extents()[2] <= 6) { - if (field.extents()[0] <= 10 && field.extents()[1] <= 10 && field.extents()[2] <= 6) + file << "------------\n"; + for (int kk = 0; kk < field.extents()[2]; ++kk) { - file << "------------\n"; - for (int kk = 0; kk < field.extents()[2]; ++kk) { - for (int jj = 0; jj < field.extents()[1]; ++jj) { - file << "|"; - for (int ii = 0; ii < field.extents()[0]; ++ii) { - file << field(ii-field.offsets()[0], jj-field.offsets()[1], kk-field.offsets()[2]); - } - file << "|\n"; + for (int jj = 0; jj < field.extents()[1]; ++jj) + { + file << "|"; + for (int ii = 0; ii < field.extents()[0]; ++ii) + { + file << field(ii - field.offsets()[0], jj - field.offsets()[1], + kk - 
field.offsets()[2]); } - file << "\n\n"; + file << "|\n"; } - file << "------------\n\n"; + file << "\n\n"; } + file << "------------\n\n"; } +} - template - bool run(ST &file, context_type& context, Comm comm, - int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int H2m2, - int H2p2, - int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3, - triple_t *_a, - triple_t *_b, - triple_t *_c, bool use_gpu) - { - // compute total domain - const std::array g_first{ 0, 0, 0}; - const std::array g_last {dims[0]*DIM1-1, dims[1]*DIM2-1, dims[2]*DIM3-1}; +template +bool +run(ST& file, context_type& context, Comm comm, int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, + int H2m1, int H2p1, int H3m1, int H3p1, int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, + int H3p2, int H1m3, int H1p3, int H2m3, int H2p3, int H3m3, int H3p3, + triple_t* _a, triple_t* _b, triple_t* _c, + bool use_gpu) +{ + // compute total domain + const std::array g_first{0, 0, 0}; + const std::array g_last{dims[0] * DIM1 - 1, dims[1] * DIM2 - 1, dims[2] * DIM3 - 1}; - // periodicity - const std::array periodic{per0,per1,per2}; + // periodicity + const std::array periodic{per0, per1, per2}; - // halos - const std::array halo_1{H1m1,H1p1,H2m1,H2p1,H3m1,H3p1}; + // halos + const std::array halo_1{H1m1, H1p1, H2m1, H2p1, H3m1, H3p1}; #ifndef GHEX_1_PATTERN_BENCHMARK - const std::array halo_2{H1m2,H1p2,H2m2,H2p2,H3m2,H3p2}; - const std::array halo_3{H1m3,H1p3,H2m3,H2p3,H3m3,H3p3}; + const std::array halo_2{H1m2, H1p2, H2m2, H2p2, H3m2, H3p2}; + const std::array halo_3{H1m3, H1p3, H2m3, H2p3, H3m3, H3p3}; #endif - // define local domain - domain_descriptor_type local_domain{ - context.rank(),//comm.rank(), - std::array{coords[0]*DIM1,coords[1]*DIM2,coords[2]*DIM3}, - std::array{(coords[0]+1)*DIM1-1,(coords[1]+1)*DIM2-1,(coords[2]+1)*DIM3-1}}; - std::vector local_domains{local_domain}; - - // 
wrap raw fields - auto a = gridtools::ghex::wrap_field>(local_domain, _a, - std::array{H1m1,H2m1,H3m1}, - std::array{(DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)}); - auto b = gridtools::ghex::wrap_field>(local_domain, _b, - std::array{H1m2,H2m2,H3m2}, - std::array{(DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)}); - auto c = gridtools::ghex::wrap_field>(local_domain, _c, - std::array{H1m3,H2m3,H3m3}, - std::array{(DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)}); - - // make halo generators - auto halo_gen_1 = halo_generator_type(g_first, g_last, halo_1, periodic); + // define local domain + domain_descriptor_type local_domain{context.rank(), //comm.rank(), + std::array{coords[0] * DIM1, coords[1] * DIM2, coords[2] * DIM3}, + std::array{(coords[0] + 1) * DIM1 - 1, (coords[1] + 1) * DIM2 - 1, + (coords[2] + 1) * DIM3 - 1}}; + std::vector local_domains{local_domain}; + + // wrap raw fields + auto a = gridtools::ghex::wrap_field>( + local_domain, _a, std::array{H1m1, H2m1, H3m1}, + std::array{(DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)}); + auto b = gridtools::ghex::wrap_field>( + local_domain, _b, std::array{H1m2, H2m2, H3m2}, + std::array{(DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)}); + auto c = gridtools::ghex::wrap_field>( + local_domain, _c, std::array{H1m3, H2m3, H3m3}, + std::array{(DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)}); + + // make halo generators + auto halo_gen_1 = halo_generator_type(g_first, g_last, halo_1, periodic); #ifndef GHEX_1_PATTERN_BENCHMARK - auto halo_gen_2 = halo_generator_type(g_first, g_last, halo_2, periodic); - auto halo_gen_3 = halo_generator_type(g_first, g_last, halo_3, periodic); + auto halo_gen_2 = halo_generator_type(g_first, g_last, halo_2, periodic); + auto halo_gen_3 = halo_generator_type(g_first, g_last, halo_3, periodic); #endif - // make patterns - auto pattern_1 = gridtools::ghex::make_pattern(context, 
halo_gen_1, local_domains); + // make patterns + auto pattern_1 = gridtools::ghex::make_pattern(context, + halo_gen_1, local_domains); #ifndef GHEX_1_PATTERN_BENCHMARK - auto pattern_2 = gridtools::ghex::make_pattern(context, halo_gen_2, local_domains); - auto pattern_3 = gridtools::ghex::make_pattern(context, halo_gen_3, local_domains); + auto pattern_2 = gridtools::ghex::make_pattern(context, + halo_gen_2, local_domains); + auto pattern_3 = gridtools::ghex::make_pattern(context, + halo_gen_3, local_domains); #endif - // communication object - auto co = gridtools::ghex::make_communication_object(comm); - - - file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; - - /* Just an initialization */ - for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) - a(ii-H1m1, jj-H2m1, kk-H3m1) = triple_t(); - - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) - b(ii-H1m2, jj-H2m2, kk-H3m2) = triple_t(); - - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) - for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) - c(ii-H1m3, jj-H2m3, kk-H3m3) = triple_t(); - - for (int ii = 0; ii < DIM1; ++ii) - for (int jj = 0; jj < DIM2; ++jj) - for (int kk = 0; kk < DIM3; ++kk) - a(ii, jj, kk) = triple_t( - ii + (DIM1)*coords[0], jj + (DIM2)*coords[1], kk + (DIM3)*coords[2]); - - for (int ii = 0; ii < DIM1; ++ii) - for (int jj = 0; jj < DIM2; ++jj) - for (int kk = 0; kk < DIM3; ++kk) - b(ii, jj, kk) = triple_t( - ii + (DIM1)*coords[0] + B_ADD, jj + (DIM2)*coords[1] + B_ADD, kk + (DIM3)*coords[2] + B_ADD); - - for (int ii = 0; ii < DIM1; ++ii) - for (int jj = 0; jj < DIM2; ++jj) - for (int kk = 0; kk < DIM3; ++kk) - c(ii, jj, kk) = triple_t( - ii + (DIM1)*coords[0] + C_ADD, jj + (DIM2)*coords[1] + C_ADD, kk + (DIM3)*coords[2] + C_ADD); 
- - file << "A \n"; - printbuff(file, a); - file << "B \n"; - printbuff(file, b); - file << "C \n"; - printbuff(file, c); - file.flush(); - - if (use_gpu) - { - triple_t::data_type *gpu_a = 0; - triple_t::data_type *gpu_b = 0; - triple_t::data_type *gpu_c = 0; - file << "***** GPU ON *****\n"; + // communication object + auto co = gridtools::ghex::make_communication_object(comm); + + file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; + + /* Just an initialization */ + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + a(ii - H1m1, jj - H2m1, kk - H3m1) = triple_t(); + + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + b(ii - H1m2, jj - H2m2, kk - H3m2) = triple_t(); + + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + c(ii - H1m3, jj - H2m3, kk - H3m3) = triple_t(); + + for (int ii = 0; ii < DIM1; ++ii) + for (int jj = 0; jj < DIM2; ++jj) + for (int kk = 0; kk < DIM3; ++kk) + a(ii, jj, kk) = triple_t(ii + (DIM1)*coords[0], + jj + (DIM2)*coords[1], kk + (DIM3)*coords[2]); + + for (int ii = 0; ii < DIM1; ++ii) + for (int jj = 0; jj < DIM2; ++jj) + for (int kk = 0; kk < DIM3; ++kk) + b(ii, jj, kk) = triple_t(ii + (DIM1)*coords[0] + B_ADD, + jj + (DIM2)*coords[1] + B_ADD, kk + (DIM3)*coords[2] + B_ADD); + + for (int ii = 0; ii < DIM1; ++ii) + for (int jj = 0; jj < DIM2; ++jj) + for (int kk = 0; kk < DIM3; ++kk) + c(ii, jj, kk) = triple_t(ii + (DIM1)*coords[0] + C_ADD, + jj + (DIM2)*coords[1] + C_ADD, kk + (DIM3)*coords[2] + C_ADD); + + file << "A \n"; + printbuff(file, a); + file << "B \n"; + printbuff(file, b); + file << "C \n"; + printbuff(file, c); + file.flush(); + + if (use_gpu) + { + triple_t::data_type* gpu_a = 0; + 
triple_t::data_type* gpu_b = 0; + triple_t::data_type* gpu_c = 0; + file << "***** GPU ON *****\n"; #ifdef GHEX_CUDACC - GT_CUDA_CHECK(cudaMalloc(&gpu_a, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type))); - GT_CUDA_CHECK(cudaMalloc(&gpu_b, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * - sizeof(triple_t::data_type))); - GT_CUDA_CHECK(cudaMalloc(&gpu_c, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type))); - - GT_CUDA_CHECK(cudaMemcpy(gpu_a, - a.data(), - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); - - GT_CUDA_CHECK(cudaMemcpy(gpu_b, - b.data(), - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); - - GT_CUDA_CHECK(cudaMemcpy(gpu_c, - c.data(), - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); + GT_CUDA_CHECK( + cudaMalloc(&gpu_a, (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type))); + GT_CUDA_CHECK( + cudaMalloc(&gpu_b, (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type))); + GT_CUDA_CHECK( + cudaMalloc(&gpu_c, (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type))); + + GT_CUDA_CHECK(cudaMemcpy(gpu_a, a.data(), + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type), + cudaMemcpyHostToDevice)); + + GT_CUDA_CHECK(cudaMemcpy(gpu_b, b.data(), + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type), + cudaMemcpyHostToDevice)); + + GT_CUDA_CHECK(cudaMemcpy(gpu_c, c.data(), + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type), + 
cudaMemcpyHostToDevice)); #else - gpu_a = new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; - gpu_b = new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; - gpu_c = new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; - - std::memcpy((void*)gpu_a, (const void*)a.data(), - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * sizeof(triple_t::data_type)); - std::memcpy((void*)gpu_b, (const void*)b.data(), - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * sizeof(triple_t::data_type)); - std::memcpy((void*)gpu_c, (const void*)c.data(), - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * sizeof(triple_t::data_type)); + gpu_a = new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * + (DIM3 + H3m1 + H3p1)]; + gpu_b = new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * + (DIM3 + H3m2 + H3p2)]; + gpu_c = new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * + (DIM3 + H3m3 + H3p3)]; + + std::memcpy((void*)gpu_a, (const void*)a.data(), + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type)); + std::memcpy((void*)gpu_b, (const void*)b.data(), + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type)); + std::memcpy((void*)gpu_c, (const void*)c.data(), + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type)); #endif - // wrap raw fields - auto field1 = gridtools::ghex::wrap_field>(local_domain, gpu_a, - std::array{H1m1,H2m1,H3m1}, - std::array{(DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)}); - auto field2 = gridtools::ghex::wrap_field>(local_domain, gpu_b, - std::array{H1m2,H2m2,H3m2}, - std::array{(DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)}); - auto field3 = gridtools::ghex::wrap_field>(local_domain, gpu_c, - std::array{H1m3,H2m3,H3m3}, - 
std::array{(DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)}); - + // wrap raw fields + auto field1 = gridtools::ghex::wrap_field>( + local_domain, gpu_a, std::array{H1m1, H2m1, H3m1}, + std::array{(DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)}); + auto field2 = gridtools::ghex::wrap_field>( + local_domain, gpu_b, std::array{H1m2, H2m2, H3m2}, + std::array{(DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)}); + auto field3 = gridtools::ghex::wrap_field>( + local_domain, gpu_c, std::array{H1m3, H2m3, H3m3}, + std::array{(DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)}); + + MPI_Barrier(context.mpi_comm()); + + // do all the stuff here + file << " LOCAL MEAN STD MIN MAX" + << std::endl; + timer_type t_0_local; + timer_type t_1_local; + timer_type t_local; + timer_type t_0_global; + timer_type t_1_global; + timer_type t_global; + const int k_start = 5; + for (int k = 0; k < 25; ++k) + { + timer_type t_0; + timer_type t_1; MPI_Barrier(context.mpi_comm()); - - // do all the stuff here - file << " LOCAL MEAN STD MIN MAX" << std::endl; - timer_type t_0_local; - timer_type t_1_local; - timer_type t_local; - timer_type t_0_global; - timer_type t_1_global; - timer_type t_global; - const int k_start = 5; - for (int k=0; k<25; ++k) - { - timer_type t_0; - timer_type t_1; - MPI_Barrier(context.mpi_comm()); - t_0.tic(); - auto h = co.exchange( + t_0.tic(); + auto h = co.exchange( #ifndef GHEX_1_PATTERN_BENCHMARK - pattern_1(field1), - pattern_2(field2), - pattern_3(field3)); + pattern_1(field1), pattern_2(field2), pattern_3(field3)); #else - pattern_1(field1), - pattern_1(field2), - pattern_1(field3)); + pattern_1(field1), pattern_1(field2), pattern_1(field3)); #endif - t_0.toc(); - t_1.tic(); - h.wait(); - t_1.toc(); - MPI_Barrier(context.mpi_comm()); - - timer_type t; - t(t_0.sum()+t_1.sum()); - - auto t_0_all = gridtools::ghex::reduce(t_0,context.mpi_comm()); - auto t_1_all = 
gridtools::ghex::reduce(t_1,context.mpi_comm()); - auto t_all = gridtools::ghex::reduce(t,context.mpi_comm()); - if (k >= k_start) - { - t_0_local(t_0); - t_1_local(t_1); - t_local(t); - t_0_global(t_0_all); - t_1_global(t_1_all); - t_global(t_all); - } + t_0.toc(); + t_1.tic(); + h.wait(); + t_1.toc(); + MPI_Barrier(context.mpi_comm()); - file << "TIME PACK/POST: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_0_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.max()/1000.0 - << std::endl; - 
file << std::endl; + timer_type t; + t(t_0.sum() + t_1.sum()); + + auto t_0_all = gridtools::ghex::reduce(t_0, context.mpi_comm()); + auto t_1_all = gridtools::ghex::reduce(t_1, context.mpi_comm()); + auto t_all = gridtools::ghex::reduce(t, context.mpi_comm()); + if (k >= k_start) + { + t_0_local(t_0); + t_1_local(t_1); + t_local(t); + t_0_global(t_0_all); + t_1_global(t_1_all); + t_global(t_all); } - file << std::endl << "-----------------" << std::endl; - file << "TIME PACK/POST: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_0_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << 
std::right << std::setw(12) << t_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.max()/1000.0 - << std::endl; + file << "TIME PACK/POST: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_all.mean() / 1000.0 << " ±" + << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_0_all.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_all.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_all.max() / 1000.0 + << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_all.mean() / 1000.0 << " ±" + << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_1_all.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_all.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_all.max() / 1000.0 + << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_all.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_all.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_all.max() / 1000.0 << std::endl; + file << std::endl; + } + + file << std::endl << "-----------------" << std::endl; + file << "TIME PACK/POST: " << std::scientific << 
std::setprecision(4) << std::right + << std::setw(12) << t_0_local.mean() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_global.mean() / 1000.0 + << " ±" << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_0_global.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_global.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_global.max() / 1000.0 + << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1_local.mean() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_global.mean() / 1000.0 + << " ±" << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_1_global.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_global.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_global.max() / 1000.0 + << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_local.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_global.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_global.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_global.max() / 1000.0 << std::endl; #ifdef GHEX_CUDACC - GT_CUDA_CHECK(cudaMemcpy(a.data(), - gpu_a, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaMemcpy(b.data(), - gpu_b, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * 
- sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaMemcpy(c.data(), - gpu_c, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaFree(gpu_a)); - GT_CUDA_CHECK(cudaFree(gpu_b)); - GT_CUDA_CHECK(cudaFree(gpu_c)); + GT_CUDA_CHECK(cudaMemcpy(a.data(), gpu_a, + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaMemcpy(b.data(), gpu_b, + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaMemcpy(c.data(), gpu_c, + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaFree(gpu_a)); + GT_CUDA_CHECK(cudaFree(gpu_b)); + GT_CUDA_CHECK(cudaFree(gpu_c)); #else - std::memcpy((void*)a.data(), (const void*)gpu_a, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * sizeof(triple_t::data_type)); - std::memcpy((void*)b.data(), (const void*)gpu_b, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * sizeof(triple_t::data_type)); - std::memcpy((void*)c.data(), (const void*)gpu_c, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * sizeof(triple_t::data_type)); - - delete[] gpu_a; - delete[] gpu_b; - delete[] gpu_c; + std::memcpy((void*)a.data(), (const void*)gpu_a, + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type)); + std::memcpy((void*)b.data(), (const void*)gpu_b, + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type)); + std::memcpy((void*)c.data(), (const void*)gpu_c, + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type)); + + delete[] gpu_a; + delete[] gpu_b; + delete[] 
gpu_c; #endif - MPI_Barrier(context.mpi_comm()); - - } - else + MPI_Barrier(context.mpi_comm()); + } + else + { + auto field1 = a; + auto field2 = b; + auto field3 = c; + MPI_Barrier(context.mpi_comm()); + + file << " LOCAL MEAN STD MIN MAX" + << std::endl; + timer_type t_0_local; + timer_type t_1_local; + timer_type t_local; + timer_type t_0_global; + timer_type t_1_global; + timer_type t_global; + const int k_start = 5; + for (int k = 0; k < 25; ++k) { - auto field1 = a; - auto field2 = b; - auto field3 = c; + timer_type t_0; + timer_type t_1; MPI_Barrier(context.mpi_comm()); - - file << " LOCAL MEAN STD MIN MAX" << std::endl; - timer_type t_0_local; - timer_type t_1_local; - timer_type t_local; - timer_type t_0_global; - timer_type t_1_global; - timer_type t_global; - const int k_start = 5; - for (int k=0; k<25; ++k) - { - timer_type t_0; - timer_type t_1; - MPI_Barrier(context.mpi_comm()); - t_0.tic(); - auto h = co.exchange( + t_0.tic(); + auto h = co.exchange( #ifndef GHEX_1_PATTERN_BENCHMARK - pattern_1(field1), - pattern_2(field2), - pattern_3(field3)); + pattern_1(field1), pattern_2(field2), pattern_3(field3)); #else - pattern_1(field1), - pattern_1(field2), - pattern_1(field3)); + pattern_1(field1), pattern_1(field2), pattern_1(field3)); #endif - t_0.toc(); - t_1.tic(); - h.wait(); - t_1.toc(); - MPI_Barrier(context.mpi_comm()); - - timer_type t; - t(t_0.sum()+t_1.sum()); - - auto t_0_all = gridtools::ghex::reduce(t_0,context.mpi_comm()); - auto t_1_all = gridtools::ghex::reduce(t_1,context.mpi_comm()); - auto t_all = gridtools::ghex::reduce(t,context.mpi_comm()); - if (k >= k_start) - { - t_0_local(t_0); - t_1_local(t_1); - t_local(t); - t_0_global(t_0_all); - t_1_global(t_1_all); - t_global(t_all); - } + t_0.toc(); + t_1.tic(); + h.wait(); + t_1.toc(); + MPI_Barrier(context.mpi_comm()); - file << "TIME PACK/POST: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0.mean()/1000.0 - << std::scientific << 
std::setprecision(4) << std::right << std::setw(12) << t_0_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_0_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.max()/1000.0 - << std::endl; - file << std::endl; - } + timer_type t; + t(t_0.sum() + t_1.sum()); - file << std::endl << "-----------------" << std::endl; - file << "TIME PACK/POST: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_0_global.stddev()/1000.0 - << 
std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.max()/1000.0 - << std::endl; - //file << std::endl << std::endl; + auto t_0_all = gridtools::ghex::reduce(t_0, context.mpi_comm()); + auto t_1_all = gridtools::ghex::reduce(t_1, context.mpi_comm()); + auto t_all = gridtools::ghex::reduce(t, context.mpi_comm()); + if (k >= k_start) + { + t_0_local(t_0); + t_1_local(t_1); + t_local(t); + t_0_global(t_0_all); + t_1_global(t_1_all); + t_global(t_all); + } - MPI_Barrier(context.mpi_comm()); + file << "TIME PACK/POST: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_all.mean() / 1000.0 << " 
±" + << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_0_all.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_all.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_all.max() / 1000.0 + << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_all.mean() / 1000.0 << " ±" + << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_1_all.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_all.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_all.max() / 1000.0 + << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_all.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_all.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_all.max() / 1000.0 << std::endl; + file << std::endl; } + file << std::endl << "-----------------" << std::endl; + file << "TIME PACK/POST: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0_local.mean() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_global.mean() / 1000.0 + << " ±" << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_0_global.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_global.min() / 1000.0 << 
std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_0_global.max() / 1000.0 + << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1_local.mean() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_global.mean() / 1000.0 + << " ±" << std::scientific << std::setprecision(4) << std::right << std::setw(11) + << t_1_global.stddev() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_global.min() / 1000.0 << std::scientific + << std::setprecision(4) << std::right << std::setw(12) << t_1_global.max() / 1000.0 + << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_local.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_global.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_global.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_global.max() / 1000.0 << std::endl; + //file << std::endl << std::endl; + + MPI_Barrier(context.mpi_comm()); + } - file << "\n********************************************************************************\n"; - - file << "A \n"; - printbuff(file, a); - file << "B \n"; - printbuff(file, b); - file << "C \n"; - printbuff(file, c); - file.flush(); + file << "\n********************************************************************************\n"; - bool passed = true; + file << "A \n"; + printbuff(file, a); + file << "B \n"; + printbuff(file, b); + file << "C \n"; + printbuff(file, c); + file.flush(); + bool passed = true; - /* Checking the data arrived correctly in the whole region + /* Checking the data arrived correctly in the whole region */ - for (int ii 
= 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) { - - triple_t ta; - int tax, tay, taz; + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + { + triple_t ta; + int tax, tay, taz; - tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); + tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); - tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); + tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); - taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); + taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); - if (!per0) { - if (((coords[0] == 0) && (ii < H1m1)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m1))) { - tax = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m1)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m1))) + { + tax = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m1)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) { - tay = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m1)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) + { + tay = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m1)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) { - taz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m1)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) + { + taz = triple_t().z(); } + } - ta = triple_t(tax, tay, taz).floor(); + ta = triple_t(tax, tay, taz).floor(); - if (a(ii-H1m1, jj-H2m1, kk-H3m1) != ta) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "a " << a(ii-H1m1, jj-H2m1, kk-H3m1) << " != " << ta << "\n"; - } + if (a(ii - H1m1, jj - H2m1, kk - H3m1) != ta) + { + passed = false; + file << ii << ", " << jj 
<< ", " << kk << " values found != expected: " + << "a " << a(ii - H1m1, jj - H2m1, kk - H3m1) << " != " << ta << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) { - - triple_t tb; - int tbx, tby, tbz; + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + { + triple_t tb; + int tbx, tby, tbz; - tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; + tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; - tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; + tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; - tbz = modulus(kk - H3m2 + (DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; + tbz = modulus(kk - H3m2 + (DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m2)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) { - tbx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m2)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) + { + tbx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m2)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) { - tby = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m2)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) + { + tby = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m2)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) { - tbz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m2)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) + { + tbz = triple_t().z(); } + } - tb = triple_t(tbx, tby, tbz).floor(); + tb = triple_t(tbx, tby, tbz).floor(); - if (b(ii-H1m2, jj-H2m2, kk-H3m2) != tb) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values 
found != expected: " - << "b " << b(ii-H1m2, jj-H2m2, kk-H3m2) << " != " << tb << "\n"; - } + if (b(ii - H1m2, jj - H2m2, kk - H3m2) != tb) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "b " << b(ii - H1m2, jj - H2m2, kk - H3m2) << " != " << tb << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) - for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) { - - triple_t tc; - int tcx, tcy, tcz; + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + { + triple_t tc; + int tcx, tcy, tcz; - tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; + tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; - tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * dims[1]) + C_ADD; + tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * dims[1]) + C_ADD; - tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; + tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m3)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) { - tcx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m3)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) + { + tcx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m3)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) { - tcy = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m3)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) + { + tcy = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m3)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) { - tcz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m3)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) + { + tcz = triple_t().z(); } + } - tc = 
triple_t(tcx, tcy, tcz).floor(); + tc = triple_t(tcx, tcy, tcz).floor(); - if (c(ii-H1m3, jj-H2m3, kk-H3m3) != tc) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "c " << c(ii-H1m3, jj-H2m3, kk-H3m3) << " != " << tc << "\n"; - } + if (c(ii - H1m3, jj - H2m3, kk - H3m3) != tc) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "c " << c(ii - H1m3, jj - H2m3, kk - H3m3) << " != " << tc << "\n"; } + } - if (passed) - file << "RESULT: PASSED!\n"; - else - file << "RESULT: FAILED!\n"; + if (passed) file << "RESULT: PASSED!\n"; + else + file << "RESULT: FAILED!\n"; - return passed; - } + return passed; +} - bool test(bool use_gpu, - int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int H2m2, - int H2p2, - int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3) - { - gridtools::ghex::tl::mpi::communicator_base world; - //std::cout << context.rank() << " " << context.world().size() << "\n"; +bool +test(bool use_gpu, int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, int H2m1, int H2p1, int H3m1, + int H3p1, int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, int H3p2, int H1m3, int H1p3, + int H2m3, int H2p3, int H3m3, int H3p3) +{ + gridtools::ghex::tl::mpi::communicator_base world; + //std::cout << context.rank() << " " << context.world().size() << "\n"; - std::stringstream ss; - ss << world.rank(); - std::string filename = "comm_2_out" + ss.str() + ".txt"; - //std::cout << filename << std::endl; - std::ofstream file(filename.c_str()); + std::stringstream ss; + ss << world.rank(); + std::string filename = "comm_2_out" + ss.str() + ".txt"; + //std::cout << filename << std::endl; + std::ofstream file(filename.c_str()); - file << world.rank() << " " << world.size() << "\n"; - dims[2] = 1; - MPI_Dims_create(world.size(), 3, dims); - int period[3] = {1, 1, 1}; + 
file << world.rank() << " " << world.size() << "\n"; + dims[2] = 1; + MPI_Dims_create(world.size(), 3, dims); + int period[3] = {1, 1, 1}; - file << "@" << world.rank() << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " << dims[2] << "\n"; + file << "@" << world.rank() << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " + << dims[2] << "\n"; - MPI_Cart_create(world, 3, dims, period, false, &CartComm); + MPI_Cart_create(world, 3, dims, period, false, &CartComm); - MPI_Cart_get(CartComm, 3, dims, period, coords); + MPI_Cart_get(CartComm, 3, dims, period, coords); - auto context_ptr = gridtools::ghex::tl::context_factory::create(CartComm); - auto& context = *context_ptr; - auto comm = context.get_communicator(); + auto context_ptr = gridtools::ghex::tl::context_factory::create(CartComm); + auto& context = *context_ptr; + auto comm = context.get_communicator(); - /* Each process will hold a tile of size + /* Each process will hold a tile of size (DIM1+2*H)x(DIM2+2*H)x(DIM3+2*H). The DIM1xDIM2xDIM3 area inside the H width border is the inner region of an hypothetical stencil computation whise halo width is H. 
*/ - file << "Field A " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m1 << " - " << H1p1 << ", " - << "Halo along j " << H2m1 << " - " << H2p1 << ", " - << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; - - file << "Field B " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m2 << " - " << H1p2 << ", " - << "Halo along j " << H2m2 << " - " << H2p2 << ", " - << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; - - file << "Field C " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m3 << " - " << H1p3 << ", " - << "Halo along j " << H2m3 << " - " << H2p3 << ", " - << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; - file.flush(); - - /* This example will exchange 3 data arrays at the same time with + file << "Field A " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m1 << " - " << H1p1 << ", " + << "Halo along j " << H2m1 << " - " << H2p1 << ", " + << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; + + file << "Field B " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m2 << " - " << H1p2 << ", " + << "Halo along j " << H2m2 << " - " << H2p2 << ", " + << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; + + file << "Field C " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m3 << " - " << H1p3 << ", " + << "Halo along j " << H2m3 << " - " << H2p3 << ", " + << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; + file.flush(); + + /* This example will exchange 3 data arrays at the same time with different values. 
*/ - triple_t *_a = - new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; - triple_t *_b = - new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; - triple_t *_c = - new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; - - bool passed = true; - - file << "Permutation 0,1,2\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, 
- H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - file << "Permutation 0,2,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - 
DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,0,2\n"; - - file << 
"run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, 
- _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,2,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - 
H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H31, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,0,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, 
H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, 
DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,1,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - 
H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed && run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c, use_gpu); - file << "---------------------------------------------------\n"; - - delete[] _a; - delete[] _b; - delete[] _c; - - return passed; - } + triple_t* _a = new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; + triple_t* _b = new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; + triple_t* _c = new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; + + bool passed = true; + + file << "Permutation 0,1,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; 
+ passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << 
"---------------------------------------------------\n"; + + file << "Permutation 0,2,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, 
comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,0,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, 
H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,2,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, 
_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << "---------------------------------------------------\n"; + + file << "Permutation 2,0,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2,
H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << "---------------------------------------------------\n"; + + file << "Permutation 2,1,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, 
H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, DIM2, + DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, + H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, 
_a, _b, _c, use_gpu); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed && run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c, use_gpu); + file << "---------------------------------------------------\n"; + + delete[] _a; + delete[] _b; + delete[] _c; + + return passed; +} } // namespace halo_exchange_3D_generic_full #ifdef STANDALONE -int main(int argc, char **argv) +int +main(int argc, char** argv) { #ifdef GT_USE_GPU device_binding(); @@ -2179,19 +1148,20 @@ int main(int argc, char **argv) int required = MPI_THREAD_MULTIPLE; int provided; int init_result = MPI_Init_thread(&argc, &argv, required, &provided); - if (init_result == MPI_ERR_OTHER) - throw std::runtime_error("MPI init failed"); + if (init_result == MPI_ERR_OTHER) throw std::runtime_error("MPI init failed"); if (provided < required) throw std::runtime_error("MPI does not support required threading level"); #else - MPI_Init(&argc,&argv); + MPI_Init(&argc, &argv); #endif - if (argc != 22) { - std::cout << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " - "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " - "halo width" - << std::endl; + if (argc != 22) + { + std::cout + << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " + "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " + "halo width" + << std::endl; return 1; } int DIM1 = atoi(argv[1]); @@ -2216,33 +1186,15 @@ int main(int argc, char **argv) int H3m3 = atoi(argv[20]); int H3p3 = atoi(argv[21]); - halo_exchange_3D_generic_full::test(DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, 
- H3p3); + halo_exchange_3D_generic_full::test(DIM1, DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, + H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3); MPI_Finalize(); return 0; } #else -TEST(Communication, comm_2_test_halo_exchange_3D_generic_full) { +TEST(Communication, comm_2_test_halo_exchange_3D_generic_full) +{ bool passed = true; //const int Nx = 98*2; @@ -2254,21 +1206,27 @@ TEST(Communication, comm_2_test_halo_exchange_3D_generic_full) { #ifdef GHEX_CUDACC gridtools::ghex::tl::mpi::communicator_base mpi_comm; - int num_devices_per_node; + int num_devices_per_node; cudaGetDeviceCount(&num_devices_per_node); MPI_Comm raw_local_comm; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, mpi_comm.rank(), MPI_INFO_NULL, &raw_local_comm); - gridtools::ghex::tl::mpi::communicator_base local_comm(raw_local_comm, gridtools::ghex::tl::mpi::comm_take_ownership); - if (local_comm.rank() #endif -namespace halo_exchange_3D_generic_full { - int pid; - int nprocs; - MPI_Comm CartComm; - int dims[3] = {0, 0, 0}; - int coords[3] = {0, 0, 0}; +namespace halo_exchange_3D_generic_full +{ +int pid; +int nprocs; +MPI_Comm CartComm; +int dims[3] = {0, 0, 0}; +int coords[3] = {0, 0, 0}; - using timer_type = gridtools::ghex::timer; +using timer_type = gridtools::ghex::timer; #define B_ADD 1 #define C_ADD 2 #ifdef VECTOR_INTERFACE - typedef int T1; - typedef int T2; - typedef int T3; +typedef int T1; +typedef int T2; +typedef int T3; #else - typedef int T1; - typedef double T2; - typedef long long int T3; +typedef int T1; +typedef double T2; +typedef long long int T3; #endif #ifdef GHEX_CUDACC - typedef gridtools::gcl::gpu arch_type; +typedef gridtools::gcl::gpu arch_type; #else - typedef gridtools::gcl::cpu arch_type; +typedef gridtools::gcl::cpu arch_type; #endif - template - bool run(ST &file, - int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int H2m2, - int H2p2, - 
int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3, - triple_t *_a, - triple_t *_b, - triple_t *_c) { - - typedef gridtools::layout_map layoutmap; - - gridtools::ghex::tl::mpi::communicator_base world; - - array, layoutmap> a( - _a, (DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)); - array, layoutmap> b( - _b, (DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)); - array, layoutmap> c( - _c, (DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)); - - /* The pattern type is defined with the layouts, data types and +template +bool +run(ST& file, int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, int H2m1, int H2p1, int H3m1, + int H3p1, int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, int H3p2, int H1m3, int H1p3, + int H2m3, int H2p3, int H3m3, int H3p3, triple_t* _a, + triple_t* _b, triple_t* _c) +{ + typedef gridtools::layout_map layoutmap; + + gridtools::ghex::tl::mpi::communicator_base world; + + array, layoutmap> a(_a, (DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), + (DIM3 + H3m1 + H3p1)); + array, layoutmap> b(_b, (DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), + (DIM3 + H3m2 + H3p2)); + array, layoutmap> c(_c, (DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), + (DIM3 + H3m3 + H3p3)); + + /* The pattern type is defined with the layouts, data types and number of dimensions. The logical assumption done in the program is that 'i' is the @@ -117,1969 +98,995 @@ namespace halo_exchange_3D_generic_full { logically to processor (p+1,q,r). The other dimensions goes as the others. */ - typedef gridtools::gcl::halo_exchange_generic, arch_type> pattern_type; + typedef gridtools::gcl::halo_exchange_generic, arch_type> + pattern_type; - /* The pattern is now instantiated with the periodicities and the + /* The pattern is now instantiated with the periodicities and the communicator. The periodicity of the communicator is irrelevant. 
Setting it to be periodic is the best choice, then GCL can deal with any periodicity easily. */ - pattern_type he(typename pattern_type::grid_type::period_type(per0, per1, per2), CartComm); - - gridtools::array halo_dsc1; - halo_dsc1[0] = gridtools::halo_descriptor(H1m1, H1p1, H1m1, DIM1 + H1m1 - 1, DIM1 + H1m1 + H1p1); - halo_dsc1[1] = gridtools::halo_descriptor(H2m1, H2p1, H2m1, DIM2 + H2m1 - 1, DIM2 + H2m1 + H2p1); - halo_dsc1[2] = gridtools::halo_descriptor(H3m1, H3p1, H3m1, DIM3 + H3m1 - 1, DIM3 + H3m1 + H3p1); - - gridtools::array halo_dsc2; - halo_dsc2[0] = gridtools::halo_descriptor(H1m2, H1p2, H1m2, DIM1 + H1m2 - 1, DIM1 + H1m2 + H1p2); - halo_dsc2[1] = gridtools::halo_descriptor(H2m2, H2p2, H2m2, DIM2 + H2m2 - 1, DIM2 + H2m2 + H2p2); - halo_dsc2[2] = gridtools::halo_descriptor(H3m2, H3p2, H3m2, DIM3 + H3m2 - 1, DIM3 + H3m2 + H3p2); - - gridtools::array halo_dsc3; - halo_dsc3[0] = gridtools::halo_descriptor(H1m3, H1p3, H1m3, DIM1 + H1m3 - 1, DIM1 + H1m3 + H1p3); - halo_dsc3[1] = gridtools::halo_descriptor(H2m3, H2p3, H2m3, DIM2 + H2m3 - 1, DIM2 + H2m3 + H2p3); - halo_dsc3[2] = gridtools::halo_descriptor(H3m3, H3p3, H3m3, DIM3 + H3m3 - 1, DIM3 + H3m3 + H3p3); - - /* Pattern is set up. This must be done only once per pattern. 
The + pattern_type he(typename pattern_type::grid_type::period_type(per0, per1, per2), CartComm); + + gridtools::array halo_dsc1; + halo_dsc1[0] = + gridtools::halo_descriptor(H1m1, H1p1, H1m1, DIM1 + H1m1 - 1, DIM1 + H1m1 + H1p1); + halo_dsc1[1] = + gridtools::halo_descriptor(H2m1, H2p1, H2m1, DIM2 + H2m1 - 1, DIM2 + H2m1 + H2p1); + halo_dsc1[2] = + gridtools::halo_descriptor(H3m1, H3p1, H3m1, DIM3 + H3m1 - 1, DIM3 + H3m1 + H3p1); + + gridtools::array halo_dsc2; + halo_dsc2[0] = + gridtools::halo_descriptor(H1m2, H1p2, H1m2, DIM1 + H1m2 - 1, DIM1 + H1m2 + H1p2); + halo_dsc2[1] = + gridtools::halo_descriptor(H2m2, H2p2, H2m2, DIM2 + H2m2 - 1, DIM2 + H2m2 + H2p2); + halo_dsc2[2] = + gridtools::halo_descriptor(H3m2, H3p2, H3m2, DIM3 + H3m2 - 1, DIM3 + H3m2 + H3p2); + + gridtools::array halo_dsc3; + halo_dsc3[0] = + gridtools::halo_descriptor(H1m3, H1p3, H1m3, DIM1 + H1m3 - 1, DIM1 + H1m3 + H1p3); + halo_dsc3[1] = + gridtools::halo_descriptor(H2m3, H2p3, H2m3, DIM2 + H2m3 - 1, DIM2 + H2m3 + H2p3); + halo_dsc3[2] = + gridtools::halo_descriptor(H3m3, H3p3, H3m3, DIM3 + H3m3 - 1, DIM3 + H3m3 + H3p3); + + /* Pattern is set up. This must be done only once per pattern. The parameter must be greater or equal to the largest number of arrays updated in a single step.
*/ - // he.setup(100, halo_dsc, sizeof(double)); + // he.setup(100, halo_dsc, sizeof(double)); - gridtools::array h_example; + gridtools::array h_example; #define MAX3(a, b, c) std::max(a, std::max(b, c)) - h_example[0] = gridtools::halo_descriptor(MAX3(H1m1, H1m2, H1m3), - MAX3(H1p1, H1p2, H1p3), - MAX3(H1m1, H1m2, H1m3), - DIM1 + MAX3(H1m1, H1m2, H1m3) - 1, - DIM1 + MAX3(H1m1, H1m2, H1m3) + MAX3(H1p1, H1p3, H1p3)); - h_example[1] = gridtools::halo_descriptor(MAX3(H2m1, H2m2, H2m3), - MAX3(H2p1, H2p2, H2p3), - MAX3(H2m1, H2m2, H2m3), - DIM2 + MAX3(H2m1, H2m2, H2m3) - 1, - DIM2 + MAX3(H2m1, H2m2, H2m3) + MAX3(H2p1, H2p3, H2p3)); - h_example[2] = gridtools::halo_descriptor(MAX3(H3m1, H3m2, H3m3), - MAX3(H3p1, H3p2, H3p3), - MAX3(H3m1, H3m2, H3m3), - DIM3 + MAX3(H3m1, H3m2, H3m3) - 1, - DIM3 + MAX3(H3m1, H3m2, H3m3) + MAX3(H3p1, H3p3, H3p3)); + h_example[0] = gridtools::halo_descriptor(MAX3(H1m1, H1m2, H1m3), MAX3(H1p1, H1p2, H1p3), + MAX3(H1m1, H1m2, H1m3), DIM1 + MAX3(H1m1, H1m2, H1m3) - 1, + DIM1 + MAX3(H1m1, H1m2, H1m3) + MAX3(H1p1, H1p3, H1p3)); + h_example[1] = gridtools::halo_descriptor(MAX3(H2m1, H2m2, H2m3), MAX3(H2p1, H2p2, H2p3), + MAX3(H2m1, H2m2, H2m3), DIM2 + MAX3(H2m1, H2m2, H2m3) - 1, + DIM2 + MAX3(H2m1, H2m2, H2m3) + MAX3(H2p1, H2p3, H2p3)); + h_example[2] = gridtools::halo_descriptor(MAX3(H3m1, H3m2, H3m3), MAX3(H3p1, H3p2, H3p3), + MAX3(H3m1, H3m2, H3m3), DIM3 + MAX3(H3m1, H3m2, H3m3) - 1, + DIM3 + MAX3(H3m1, H3m2, H3m3) + MAX3(H3p1, H3p3, H3p3)); #undef MAX3 - he.setup(3, - gridtools::gcl::field_on_the_fly(nullptr, h_example), // BEWARE!!!! 
- std::max(sizeof(triple_t::data_type), - std::max(sizeof(triple_t::data_type), - sizeof(triple_t::data_type)) // Estimates the size - )); - - file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; - - /* Just an initialization */ - for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) { - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) { - a(ii, jj, kk) = triple_t(); - } + he.setup(3, + gridtools::gcl::field_on_the_fly(nullptr, + h_example), // BEWARE!!!! + std::max(sizeof(triple_t::data_type), + std::max(sizeof(triple_t::data_type), + sizeof(triple_t::data_type)) // Estimates the size + )); + + file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; + + /* Just an initialization */ + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + { + a(ii, jj, kk) = triple_t(); } + } - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) { - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) { - b(ii, jj, kk) = triple_t(); - } + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + { + b(ii, jj, kk) = triple_t(); } + } - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) { - for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) { - c(ii, jj, kk) = triple_t(); - } + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + { + c(ii, jj, kk) = triple_t(); } + } - for (int ii = H1m1; ii < DIM1 + H1m1; ++ii) - for (int jj = H2m1; jj < DIM2 + H2m1; ++jj) - for (int kk = H3m1; kk < DIM3 + H3m1; ++kk) { - a(ii, jj, kk) = triple_t( - ii - H1m1 + (DIM1)*coords[0], jj - H2m1 + (DIM2)*coords[1], kk - H3m1 + 
(DIM3)*coords[2]); - } + for (int ii = H1m1; ii < DIM1 + H1m1; ++ii) + for (int jj = H2m1; jj < DIM2 + H2m1; ++jj) + for (int kk = H3m1; kk < DIM3 + H3m1; ++kk) + { + a(ii, jj, kk) = triple_t(ii - H1m1 + (DIM1)*coords[0], + jj - H2m1 + (DIM2)*coords[1], kk - H3m1 + (DIM3)*coords[2]); + } - for (int ii = H1m2; ii < DIM1 + H1m2; ++ii) - for (int jj = H2m2; jj < DIM2 + H2m2; ++jj) - for (int kk = H3m2; kk < DIM3 + H3m2; ++kk) { - b(ii, jj, kk) = triple_t(ii - H1m2 + (DIM1)*coords[0] + B_ADD, - jj - H2m2 + (DIM2)*coords[1] + B_ADD, - kk - H3m2 + (DIM3)*coords[2] + B_ADD); - } + for (int ii = H1m2; ii < DIM1 + H1m2; ++ii) + for (int jj = H2m2; jj < DIM2 + H2m2; ++jj) + for (int kk = H3m2; kk < DIM3 + H3m2; ++kk) + { + b(ii, jj, kk) = triple_t(ii - H1m2 + (DIM1)*coords[0] + B_ADD, + jj - H2m2 + (DIM2)*coords[1] + B_ADD, kk - H3m2 + (DIM3)*coords[2] + B_ADD); + } - for (int ii = H1m3; ii < DIM1 + H1m3; ++ii) - for (int jj = H2m3; jj < DIM2 + H2m3; ++jj) - for (int kk = H3m3; kk < DIM3 + H3m3; ++kk) { - c(ii, jj, kk) = triple_t(ii - H1m3 + (DIM1)*coords[0] + C_ADD, - jj - H2m3 + (DIM2)*coords[1] + C_ADD, - kk - H3m3 + (DIM3)*coords[2] + C_ADD); - } + for (int ii = H1m3; ii < DIM1 + H1m3; ++ii) + for (int jj = H2m3; jj < DIM2 + H2m3; ++jj) + for (int kk = H3m3; kk < DIM3 + H3m3; ++kk) + { + c(ii, jj, kk) = triple_t(ii - H1m3 + (DIM1)*coords[0] + C_ADD, + jj - H2m3 + (DIM2)*coords[1] + C_ADD, kk - H3m3 + (DIM3)*coords[2] + C_ADD); + } - file << "A \n"; - printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); - file << "B \n"; - printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); - file << "C \n"; - printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); - file.flush(); + file << "A \n"; + printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); + file << "B \n"; + printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); + file << "C \n"; + 
printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); + file.flush(); #ifdef GHEX_CUDACC - file << "***** GPU ON *****\n"; - - triple_t::data_type *gpu_a = 0; - triple_t::data_type *gpu_b = 0; - triple_t::data_type *gpu_c = 0; - GT_CUDA_CHECK(cudaMalloc(&gpu_a, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type))); - GT_CUDA_CHECK(cudaMalloc(&gpu_b, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * - sizeof(triple_t::data_type))); - GT_CUDA_CHECK(cudaMalloc(&gpu_c, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type))); - - GT_CUDA_CHECK(cudaMemcpy(gpu_a, - a.ptr, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); - - GT_CUDA_CHECK(cudaMemcpy(gpu_b, - b.ptr, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); - - GT_CUDA_CHECK(cudaMemcpy(gpu_c, - c.ptr, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type), - cudaMemcpyHostToDevice)); - - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field1( - reinterpret_cast::data_type *>(gpu_a), halo_dsc1); - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field2( - reinterpret_cast::data_type *>(gpu_b), halo_dsc2); - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field3( - reinterpret_cast::data_type *>(gpu_c), halo_dsc3); + file << "***** GPU ON *****\n"; + + triple_t::data_type* gpu_a = 0; + triple_t::data_type* gpu_b = 0; + triple_t::data_type* gpu_c = 0; + GT_CUDA_CHECK( + cudaMalloc(&gpu_a, (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type))); + GT_CUDA_CHECK( + cudaMalloc(&gpu_b, (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + 
H3p2) * + sizeof(triple_t::data_type))); + GT_CUDA_CHECK( + cudaMalloc(&gpu_c, (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type))); + + GT_CUDA_CHECK(cudaMemcpy(gpu_a, a.ptr, + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type), + cudaMemcpyHostToDevice)); + + GT_CUDA_CHECK(cudaMemcpy(gpu_b, b.ptr, + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type), + cudaMemcpyHostToDevice)); + + GT_CUDA_CHECK(cudaMemcpy(gpu_c, c.ptr, + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + sizeof(triple_t::data_type), + cudaMemcpyHostToDevice)); + + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field1(reinterpret_cast::data_type*>(gpu_a), halo_dsc1); + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field2(reinterpret_cast::data_type*>(gpu_b), halo_dsc2); + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field3(reinterpret_cast::data_type*>(gpu_c), halo_dsc3); #else - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field1( - reinterpret_cast::data_type *>(a.ptr), halo_dsc1); - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field2( - reinterpret_cast::data_type *>(b.ptr), halo_dsc2); - gridtools::gcl::field_on_the_fly::data_type, layoutmap, pattern_type::traits> field3( - reinterpret_cast::data_type *>(c.ptr), halo_dsc3); + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field1(reinterpret_cast::data_type*>(a.ptr), halo_dsc1); + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field2(reinterpret_cast::data_type*>(b.ptr), halo_dsc2); + gridtools::gcl::field_on_the_fly::data_type, layoutmap, + pattern_type::traits> + field3(reinterpret_cast::data_type*>(c.ptr), halo_dsc3); #endif - - file << " 
LOCAL MEAN STD MIN MAX" << std::endl; - timer_type t_0_local; - timer_type t_1_local; - timer_type t_local; - timer_type t_0_global; - timer_type t_1_global; - timer_type t_global; - const int k_start = 5; - for (int k=0; k<25; ++k) - { - timer_type t_0; - timer_type t_1; + file << " LOCAL MEAN STD MIN MAX" + << std::endl; + timer_type t_0_local; + timer_type t_1_local; + timer_type t_local; + timer_type t_0_global; + timer_type t_1_global; + timer_type t_global; + const int k_start = 5; + for (int k = 0; k < 25; ++k) + { + timer_type t_0; + timer_type t_1; #ifdef VECTOR_INTERFACE - world.barrier(); - t_0.tic(); - he.pack(vect); - t_0.toc(); - t_1.tic(); - he.exchange(); - he.unpack(vect); - t_1.toc(); - world.barrier(); + world.barrier(); + t_0.tic(); + he.pack(vect); + t_0.toc(); + t_1.tic(); + he.exchange(); + he.unpack(vect); + t_1.toc(); + world.barrier(); #else - world.barrier(); - t_0.tic(); - he.pack(field1, field2, field3); - t_0.toc(); - t_1.tic(); - he.exchange(); - he.unpack(field1, field2, field3); - t_1.toc(); - world.barrier(); + world.barrier(); + t_0.tic(); + he.pack(field1, field2, field3); + t_0.toc(); + t_1.tic(); + he.exchange(); + he.unpack(field1, field2, field3); + t_1.toc(); + world.barrier(); #endif - timer_type t; - t(t_0.sum()+t_1.sum()); + timer_type t; + t(t_0.sum() + t_1.sum()); - auto t_0_all = gridtools::ghex::reduce(t_0,world); - auto t_1_all = gridtools::ghex::reduce(t_1,world); - auto t_all = gridtools::ghex::reduce(t,world); - if (k >= k_start) - { - t_0_local(t_0); - t_1_local(t_1); - t_local(t); - t_0_global(t_0_all); - t_1_global(t_1_all); - t_global(t_all); - } - - file << "TIME PACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_0_all.stddev()/1000.0 - << std::scientific << 
std::setprecision(4) << std::right << std::setw(12) << t_0_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_all.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_all.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_all.max()/1000.0 - << std::endl; - file << std::endl; + auto t_0_all = gridtools::ghex::reduce(t_0, world); + auto t_1_all = gridtools::ghex::reduce(t_1, world); + auto t_all = gridtools::ghex::reduce(t, world); + if (k >= k_start) + { + t_0_local(t_0); + t_1_local(t_1); + t_local(t); + t_0_global(t_0_all); + t_1_global(t_1_all); + t_global(t_all); } - file << std::endl << "-----------------" << std::endl; - file << "TIME PACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << 
t_0_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_0_global.max()/1000.0 - << std::endl; - file << "TIME WAIT/UNPACK: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_1_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_1_global.max()/1000.0 - << std::endl; - file << "TIME ALL: " - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_local.mean()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.mean()/1000.0 - << " ±" - << std::scientific << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.min()/1000.0 - << std::scientific << std::setprecision(4) << std::right << std::setw(12) << t_global.max()/1000.0 - << std::endl; + file << "TIME PACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_all.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_0_all.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_0_all.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0_all.max() / 1000.0 << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << 
std::setprecision(4) << std::right + << std::setw(12) << t_1.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_all.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_1_all.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_1_all.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1_all.max() / 1000.0 << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_all.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_all.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_all.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_all.max() / 1000.0 << std::endl; + file << std::endl; + } + + file << std::endl << "-----------------" << std::endl; + file << "TIME PACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0_local.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_0_global.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_0_global.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_0_global.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_0_global.max() / 1000.0 << std::endl; + file << "TIME WAIT/UNPACK: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1_local.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_1_global.mean() / 1000.0 << " ±" << std::scientific + << 
std::setprecision(4) << std::right << std::setw(11) << t_1_global.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_1_global.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_1_global.max() / 1000.0 << std::endl; + file << "TIME ALL: " << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_local.mean() / 1000.0 << std::scientific << std::setprecision(4) + << std::right << std::setw(12) << t_global.mean() / 1000.0 << " ±" << std::scientific + << std::setprecision(4) << std::right << std::setw(11) << t_global.stddev() / 1000.0 + << std::scientific << std::setprecision(4) << std::right << std::setw(12) + << t_global.min() / 1000.0 << std::scientific << std::setprecision(4) << std::right + << std::setw(12) << t_global.max() / 1000.0 << std::endl; #ifdef GHEX_CUDACC - GT_CUDA_CHECK(cudaMemcpy(a.ptr, - gpu_a, - (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * - sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaMemcpy(b.ptr, - gpu_b, - (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * - sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaMemcpy(c.ptr, - gpu_c, - (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * - sizeof(triple_t::data_type), - cudaMemcpyDeviceToHost)); - - GT_CUDA_CHECK(cudaFree(gpu_a)); - GT_CUDA_CHECK(cudaFree(gpu_b)); - GT_CUDA_CHECK(cudaFree(gpu_c)); + GT_CUDA_CHECK(cudaMemcpy(a.ptr, gpu_a, + (DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1) * + sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaMemcpy(b.ptr, gpu_b, + (DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2) * + sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaMemcpy(c.ptr, gpu_c, + (DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3) * + 
sizeof(triple_t::data_type), + cudaMemcpyDeviceToHost)); + + GT_CUDA_CHECK(cudaFree(gpu_a)); + GT_CUDA_CHECK(cudaFree(gpu_b)); + GT_CUDA_CHECK(cudaFree(gpu_c)); #endif - file << "\n********************************************************************************\n"; + file << "\n********************************************************************************\n"; - file << "A \n"; - printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); - file << "B \n"; - printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); - file << "C \n"; - printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); - file.flush(); + file << "A \n"; + printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); + file << "B \n"; + printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); + file << "C \n"; + printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); + file.flush(); - int passed = true; + int passed = true; - /* Checking the data arrived correctly in the whole region + /* Checking the data arrived correctly in the whole region */ - for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) { - - triple_t ta; - int tax, tay, taz; + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + { + triple_t ta; + int tax, tay, taz; - tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); + tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); - tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); + tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); - taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); + taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); - if (!per0) { - if (((coords[0] == 0) && (ii < H1m1)) || ((coords[0] == 
dims[0] - 1) && (ii >= DIM1 + H1m1))) { - tax = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m1)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m1))) + { + tax = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m1)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) { - tay = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m1)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) + { + tay = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m1)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) { - taz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m1)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) + { + taz = triple_t().z(); } + } - ta = triple_t(tax, tay, taz).floor(); + ta = triple_t(tax, tay, taz).floor(); - if (a(ii, jj, kk) != ta) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "a " << a(ii, jj, kk) << " != " << ta << "\n"; - } + if (a(ii, jj, kk) != ta) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "a " << a(ii, jj, kk) << " != " << ta << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) { - - triple_t tb; - int tbx, tby, tbz; + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + { + triple_t tb; + int tbx, tby, tbz; - tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; + tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; - tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; + tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; - tbz = modulus(kk - H3m2 + (DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; + tbz = modulus(kk - H3m2 + 
(DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m2)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) { - tbx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m2)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) + { + tbx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m2)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) { - tby = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m2)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) + { + tby = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m2)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) { - tbz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m2)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) + { + tbz = triple_t().z(); } + } - tb = triple_t(tbx, tby, tbz).floor(); + tb = triple_t(tbx, tby, tbz).floor(); - if (b(ii, jj, kk) != tb) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "b " << b(ii, jj, kk) << " != " << tb << "\n"; - } + if (b(ii, jj, kk) != tb) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "b " << b(ii, jj, kk) << " != " << tb << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) - for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) { - - triple_t tc; - int tcx, tcy, tcz; + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + { + triple_t tc; + int tcx, tcy, tcz; - tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; + tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; - tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * dims[1]) + C_ADD; + tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * 
dims[1]) + C_ADD; - tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; + tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m3)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) { - tcx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m3)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) + { + tcx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m3)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) { - tcy = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m3)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) + { + tcy = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m3)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) { - tcz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m3)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) + { + tcz = triple_t().z(); } + } - tc = triple_t(tcx, tcy, tcz).floor(); + tc = triple_t(tcx, tcy, tcz).floor(); - if (c(ii, jj, kk) != tc) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "c " << c(ii, jj, kk) << " != " << tc << "\n"; - } + if (c(ii, jj, kk) != tc) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "c " << c(ii, jj, kk) << " != " << tc << "\n"; } + } - if (passed) - file << "RESULT: PASSED!\n"; - else - file << "RESULT: FAILED!\n"; + if (passed) file << "RESULT: PASSED!\n"; + else + file << "RESULT: FAILED!\n"; - return passed; - } + return passed; +} - bool test(int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int H2m2, - int H2p2, - int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3) { - - /* Here we compute the computing gris as in many 
applications +bool +test(int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, int H2m1, int H2p1, int H3m1, int H3p1, + int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, int H3p2, int H1m3, int H1p3, int H2m3, + int H2p3, int H3m3, int H3p3) +{ + /* Here we compute the computing gris as in many applications */ - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - //std::cout << pid << " " << nprocs << "\n"; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + //std::cout << pid << " " << nprocs << "\n"; - std::stringstream ss; - ss << pid; - std::string filename = "gcl_out" + ss.str() + ".txt"; - //std::cout << filename << std::endl; - std::ofstream file(filename.c_str()); + std::stringstream ss; + ss << pid; + std::string filename = "gcl_out" + ss.str() + ".txt"; + //std::cout << filename << std::endl; + std::ofstream file(filename.c_str()); - file << pid << " " << nprocs << "\n"; + file << pid << " " << nprocs << "\n"; - dims[2]=1; - MPI_Dims_create(nprocs, 3, dims); - int period[3] = {1, 1, 1}; + dims[2] = 1; + MPI_Dims_create(nprocs, 3, dims); + int period[3] = {1, 1, 1}; - file << "@" << pid << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " << dims[2] << "\n"; + file << "@" << pid << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " << dims[2] + << "\n"; - MPI_Cart_create(MPI_COMM_WORLD, 3, dims, period, false, &CartComm); + MPI_Cart_create(MPI_COMM_WORLD, 3, dims, period, false, &CartComm); - MPI_Cart_get(CartComm, 3, dims, period, coords); + MPI_Cart_get(CartComm, 3, dims, period, coords); - /* Each process will hold a tile of size + /* Each process will hold a tile of size (DIM1+2*H)x(DIM2+2*H)x(DIM3+2*H). The DIM1xDIM2xDIM3 area inside the H width border is the inner region of an hypothetical stencil computation whise halo width is H. 
*/ - file << "Field A " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m1 << " - " << H1p1 << ", " - << "Halo along j " << H2m1 << " - " << H2p1 << ", " - << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; - - file << "Field B " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m2 << " - " << H1p2 << ", " - << "Halo along j " << H2m2 << " - " << H2p2 << ", " - << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; - - file << "Field C " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m3 << " - " << H1p3 << ", " - << "Halo along j " << H2m3 << " - " << H2p3 << ", " - << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; - file.flush(); - - /* This example will exchange 3 data arrays at the same time with + file << "Field A " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m1 << " - " << H1p1 << ", " + << "Halo along j " << H2m1 << " - " << H2p1 << ", " + << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; + + file << "Field B " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m2 << " - " << H1p2 << ", " + << "Halo along j " << H2m2 << " - " << H2p2 << ", " + << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; + + file << "Field C " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m3 << " - " << H1p3 << ", " + << "Halo along j " << H2m3 << " - " << H2p3 << ", " + << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; + file.flush(); + + /* This example will exchange 3 data arrays at the same time with different values. 
*/ - triple_t *_a = - new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; - triple_t *_b = - new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; - triple_t *_c = - new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; - - bool passed = true; - - file << "Permutation 0,1,2\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, 
H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 0,2,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, 
H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,0,2\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, 
H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - 
DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,2,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, 
- DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H31, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,0,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, 
- H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,1,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, 
- H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - 
H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - delete[] _a; - delete[] _b; - delete[] _c; - - return passed; - } + triple_t* _a = new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; + triple_t* _b = new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; + triple_t* _c = new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; + + bool passed = true; + + file << "Permutation 0,1,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, 
H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 0,2,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, 
DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,0,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + 
_a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,2,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + 
H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H31, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 2,0,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, 
H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << 
"---------------------------------------------------\n"; + + file << "Permutation 2,1,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, H1m1, + H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, + H1p3, H2m3, H2p3, H3m3, H3p3, 
_a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, DIM1, DIM2, DIM3, + H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, H3m2, H3p2, + H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + delete[] _a; + delete[] _b; + delete[] _c; + + return passed; +} } // namespace halo_exchange_3D_generic_full #ifdef STANDALONE -int main(int argc, char **argv) { +int +main(int argc, char** argv) +{ #ifdef GT_USE_GPU device_binding(); #endif @@ -2087,11 +1094,13 @@ int main(int argc, char **argv) { MPI_Init(&argc, &argv); gridtools::GCL_Init(argc, argv); - if (argc != 22) { - std::cout << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " - "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " - "halo width" - << std::endl; + if (argc != 22) + { + std::cout + << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " + "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " + "halo width" + << std::endl; return 1; } int DIM1 = atoi(argv[1]); @@ -2116,32 +1125,14 @@ int main(int argc, char **argv) { int H3m3 = atoi(argv[20]); int H3p3 = atoi(argv[21]); - halo_exchange_3D_generic_full::test(DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3); + halo_exchange_3D_generic_full::test(DIM1, DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, + H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3); MPI_Finalize(); } #else -TEST(Communication, gcl_test_halo_exchange_3D_generic_full) { +TEST(Communication, gcl_test_halo_exchange_3D_generic_full) +{ //const int Nx = 98*2; //const int Ny = 54*3; //const int Nz = 87*2; @@ 
-2149,10 +1140,12 @@ TEST(Communication, gcl_test_halo_exchange_3D_generic_full) { const int Ny = 260; const int Nz = 80; #ifndef GHEX_1_PATTERN_BENCHMARK - bool passed = halo_exchange_3D_generic_full::test(Nx, Ny, Nz, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 0, 1); + bool passed = halo_exchange_3D_generic_full::test(Nx, Ny, Nz, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 2, + 1, 0, 1, 2, 3, 0, 1); #else //bool passed = halo_exchange_3D_generic_full::test(Nx, Ny, Nz, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 2, 1); - bool passed = halo_exchange_3D_generic_full::test(Nx, Ny, Nz, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 0, 0); + bool passed = halo_exchange_3D_generic_full::test(Nx, Ny, Nz, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 0, + 0, 3, 3, 3, 3, 0, 0); #endif EXPECT_TRUE(passed); } diff --git a/benchmarks/simple_comm_test_halo_exchange_3D_generic_full.cpp b/benchmarks/simple_comm_test_halo_exchange_3D_generic_full.cpp index 530c6e20..66e2034b 100644 --- a/benchmarks/simple_comm_test_halo_exchange_3D_generic_full.cpp +++ b/benchmarks/simple_comm_test_halo_exchange_3D_generic_full.cpp @@ -40,276 +40,280 @@ using transport = gridtools::ghex::tl::mpi_tag; using context_type = typename gridtools::ghex::tl::context_factory::context_type; /* CPU data descriptor */ -template -class my_data_desc { - +template +class my_data_desc +{ using coordinate_t = typename DomainDescriptor::coordinate_type; using Byte = unsigned char; const DomainDescriptor& m_domain; - coordinate_t m_halos_offset; - array m_values; - -public: + coordinate_t m_halos_offset; + array m_values; + public: using value_type = T; - my_data_desc(const DomainDescriptor& domain, - const coordinate_t& halos_offset, - const array& values) : - m_domain{domain}, - m_halos_offset{halos_offset}, - m_values{values} {} + my_data_desc(const DomainDescriptor& domain, const coordinate_t& halos_offset, + const array& values) + : m_domain{domain} + , m_halos_offset{halos_offset} + , m_values{values} + { + } - void set(const T& 
value, const coordinate_t& coords) { - m_values(coords[0] + m_halos_offset[0], coords[1] + m_halos_offset[1], coords[2] + m_halos_offset[2]) = value; + void set(const T& value, const coordinate_t& coords) + { + m_values(coords[0] + m_halos_offset[0], coords[1] + m_halos_offset[1], + coords[2] + m_halos_offset[2]) = value; } - const T& get(const coordinate_t& coords) const { - return m_values(coords[0] + m_halos_offset[0], coords[1] + m_halos_offset[1], coords[2] + m_halos_offset[2]); + const T& get(const coordinate_t& coords) const + { + return m_values(coords[0] + m_halos_offset[0], coords[1] + m_halos_offset[1], + coords[2] + m_halos_offset[2]); } - template - void set(const IterationSpace& is, const Byte* buffer) { + template + void set(const IterationSpace& is, const Byte* buffer) + { //std::cout << "DEBUG: is.first()[2] = " << is.local().first()[2] << "\n"; //std::cout << "DEBUG: is.last()[2] = " << is.local().last()[2] << "\n"; //std::cout.flush(); - gridtools::ghex::detail::for_loop<3, 3, LayoutMap>::apply([this, &buffer](auto... indices){ - coordinate_t coords{indices...}; - //std::cout << "DEBUG: coords = " << coords[0] << ", " << coords[1] << ", " << coords[2] << "\n"; - //std::cout.flush(); - set(*(reinterpret_cast(buffer)), coords); - //std::cout << "DEBUG: just set value " << get(coords) << "\n"; - //std::cout.flush(); - buffer += sizeof(T); - }, is.local().first(), is.local().last()); + gridtools::ghex::detail::for_loop<3, 3, LayoutMap>::apply( + [this, &buffer](auto... 
indices) + { + coordinate_t coords{indices...}; + //std::cout << "DEBUG: coords = " << coords[0] << ", " << coords[1] << ", " << coords[2] << "\n"; + //std::cout.flush(); + set(*(reinterpret_cast(buffer)), coords); + //std::cout << "DEBUG: just set value " << get(coords) << "\n"; + //std::cout.flush(); + buffer += sizeof(T); + }, + is.local().first(), is.local().last()); } - template - void get(const IterationSpace& is, Byte* buffer) const { - gridtools::ghex::detail::for_loop<3, 3, LayoutMap>::apply([this, &buffer](auto... indices){ - coordinate_t coords{indices...}; - //std::cout << "DEBUG: coords = " << coords[0] << ", " << coords[1] << ", " << coords[2] << "\n"; - //std::cout.flush(); - const T* tmp_ptr{&get(coords)}; - std::memcpy(buffer, tmp_ptr, sizeof(T)); - //std::cout << "DEBUG: just got value " << *(reinterpret_cast(buffer)) << "\n"; - //std::cout.flush(); - buffer += sizeof(T); - }, is.local().first(), is.local().last()); + template + void get(const IterationSpace& is, Byte* buffer) const + { + gridtools::ghex::detail::for_loop<3, 3, LayoutMap>::apply( + [this, &buffer](auto... 
indices) + { + coordinate_t coords{indices...}; + //std::cout << "DEBUG: coords = " << coords[0] << ", " << coords[1] << ", " << coords[2] << "\n"; + //std::cout.flush(); + const T* tmp_ptr{&get(coords)}; + std::memcpy(buffer, tmp_ptr, sizeof(T)); + //std::cout << "DEBUG: just got value " << *(reinterpret_cast(buffer)) << "\n"; + //std::cout.flush(); + buffer += sizeof(T); + }, + is.local().first(), is.local().last()); } - }; - -namespace halo_exchange_3D_generic_full { - - using domain_descriptor_t = gridtools::ghex::structured::regular::domain_descriptor>; - using domain_id_t = domain_descriptor_t::domain_id_type; - using coordinate_t = domain_descriptor_t::coordinate_type; - using halo_generator_t = gridtools::ghex::structured::regular::halo_generator>; - - int pid; - int nprocs; - MPI_Comm CartComm; - int dims[3] = {0, 0, 0}; - int coords[3] = {0, 0, 0}; - - struct timeval start_tv; - struct timeval stop1_tv; - struct timeval stop2_tv; - struct timeval stop3_tv; - double lapse_time1; - double lapse_time2; - double lapse_time3; - double lapse_time4; +namespace halo_exchange_3D_generic_full +{ + +using domain_descriptor_t = + gridtools::ghex::structured::regular::domain_descriptor>; +using domain_id_t = domain_descriptor_t::domain_id_type; +using coordinate_t = domain_descriptor_t::coordinate_type; +using halo_generator_t = gridtools::ghex::structured::regular::halo_generator>; + +int pid; +int nprocs; +MPI_Comm CartComm; +int dims[3] = {0, 0, 0}; +int coords[3] = {0, 0, 0}; + +struct timeval start_tv; +struct timeval stop1_tv; +struct timeval stop2_tv; +struct timeval stop3_tv; +double lapse_time1; +double lapse_time2; +double lapse_time3; +double lapse_time4; #define B_ADD 1 #define C_ADD 2 - typedef int T1; - typedef double T2; - typedef long long int T3; - - template - bool run(ST &file, context_type& context, Comm comm, - int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int 
H2m2, - int H2p2, - int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3, - triple_t *_a, - triple_t *_b, - triple_t *_c) { - - typedef gridtools::layout_map layoutmap; - - typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_1; - typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_2; - typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_3; - - const std::array g_first{0 , 0 , 0 }; - const std::array g_last {dims[0] * DIM1 - 1, dims[1] * DIM2 - 1, dims[2] * DIM3 - 1}; - - const std::array halos_1{H1m1, H1p1, H2m1, H2p1, H3m1, H3p1}; - const std::array halos_2{H1m2, H1p2, H2m2, H2p2, H3m2, H3p2}; - const std::array halos_3{H1m3, H1p3, H2m3, H2p3, H3m3, H3p3}; - - const std::array periodic{per0, per1, per2}; - - std::vector local_domains; - - domain_descriptor_t my_domain_1{ - pid, - coordinate_t{(coords[0] ) * DIM1 , (coords[1] ) * DIM2 , (coords[2] ) * DIM3 }, - coordinate_t{(coords[0] + 1) * DIM1 - 1, (coords[1] + 1) * DIM2 - 1, (coords[2] + 1) * DIM3 - 1} - }; - local_domains.push_back(my_domain_1); - - auto halo_gen_1 = halo_generator_t{g_first, g_last, halos_1, periodic}; - auto halo_gen_2 = halo_generator_t{g_first, g_last, halos_2, periodic}; - auto halo_gen_3 = halo_generator_t{g_first, g_last, halos_3, periodic}; - - auto patterns_1 = gridtools::ghex::make_pattern(context, halo_gen_1, local_domains); - auto patterns_2 = gridtools::ghex::make_pattern(context, halo_gen_2, local_domains); - auto patterns_3 = gridtools::ghex::make_pattern(context, halo_gen_3, local_domains); - - using communication_object_t = gridtools::ghex::communication_object; // same type for all patterns - - std::vector cos_1; - for (const auto& p : patterns_1) cos_1.push_back(communication_object_t{p,comm}); - std::vector cos_2; - for (const auto& p : patterns_2) cos_2.push_back(communication_object_t{p,comm}); - std::vector cos_3; - for (const auto& p : patterns_3) 
cos_3.push_back(communication_object_t{p,comm}); - - array, layoutmap> a( - _a, (DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), (DIM3 + H3m1 + H3p1)); - array, layoutmap> b( - _b, (DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), (DIM3 + H3m2 + H3p2)); - array, layoutmap> c( - _c, (DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), (DIM3 + H3m3 + H3p3)); - - file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; - - /* Just an initialization */ - for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) { - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) { - a(ii, jj, kk) = triple_t(); - } +typedef int T1; +typedef double T2; +typedef long long int T3; + +template +bool +run(ST& file, context_type& context, Comm comm, int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, + int H2m1, int H2p1, int H3m1, int H3p1, int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, + int H3p2, int H1m3, int H1p3, int H2m3, int H2p3, int H3m3, int H3p3, + triple_t* _a, triple_t* _b, triple_t* _c) +{ + typedef gridtools::layout_map layoutmap; + + typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_1; + typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_2; + typedef my_data_desc, domain_descriptor_t, layoutmap> data_dsc_type_3; + + const std::array g_first{0, 0, 0}; + const std::array g_last{dims[0] * DIM1 - 1, dims[1] * DIM2 - 1, dims[2] * DIM3 - 1}; + + const std::array halos_1{H1m1, H1p1, H2m1, H2p1, H3m1, H3p1}; + const std::array halos_2{H1m2, H1p2, H2m2, H2p2, H3m2, H3p2}; + const std::array halos_3{H1m3, H1p3, H2m3, H2p3, H3m3, H3p3}; + + const std::array periodic{per0, per1, per2}; + + std::vector local_domains; + + domain_descriptor_t my_domain_1{pid, + coordinate_t{(coords[0]) * DIM1, (coords[1]) * DIM2, (coords[2]) * DIM3}, + coordinate_t{(coords[0] + 1) * DIM1 - 1, (coords[1] + 1) * DIM2 - 1, + (coords[2] + 1) * DIM3 - 1}}; + local_domains.push_back(my_domain_1); + + auto halo_gen_1 = 
halo_generator_t{g_first, g_last, halos_1, periodic}; + auto halo_gen_2 = halo_generator_t{g_first, g_last, halos_2, periodic}; + auto halo_gen_3 = halo_generator_t{g_first, g_last, halos_3, periodic}; + + auto patterns_1 = gridtools::ghex::make_pattern(context, + halo_gen_1, local_domains); + auto patterns_2 = gridtools::ghex::make_pattern(context, + halo_gen_2, local_domains); + auto patterns_3 = gridtools::ghex::make_pattern(context, + halo_gen_3, local_domains); + + using communication_object_t = + gridtools::ghex::communication_object; // same type for all patterns + + std::vector cos_1; + for (const auto& p : patterns_1) cos_1.push_back(communication_object_t{p, comm}); + std::vector cos_2; + for (const auto& p : patterns_2) cos_2.push_back(communication_object_t{p, comm}); + std::vector cos_3; + for (const auto& p : patterns_3) cos_3.push_back(communication_object_t{p, comm}); + + array, layoutmap> a(_a, (DIM1 + H1m1 + H1p1), (DIM2 + H2m1 + H2p1), + (DIM3 + H3m1 + H3p1)); + array, layoutmap> b(_b, (DIM1 + H1m2 + H1p2), (DIM2 + H2m2 + H2p2), + (DIM3 + H3m2 + H3p2)); + array, layoutmap> c(_c, (DIM1 + H1m3 + H1p3), (DIM2 + H2m3 + H2p3), + (DIM3 + H3m3 + H3p3)); + + file << "Proc: (" << coords[0] << ", " << coords[1] << ", " << coords[2] << ")\n"; + + /* Just an initialization */ + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + { + a(ii, jj, kk) = triple_t(); } + } - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) { - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) { - b(ii, jj, kk) = triple_t(); - } + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + { + b(ii, jj, kk) = triple_t(); } + } - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) { - for (int kk = 
0; kk < DIM3 + H3m3 + H3p3; ++kk) { - c(ii, jj, kk) = triple_t(); - } + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + { + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + { + c(ii, jj, kk) = triple_t(); } + } - for (int ii = H1m1; ii < DIM1 + H1m1; ++ii) - for (int jj = H2m1; jj < DIM2 + H2m1; ++jj) - for (int kk = H3m1; kk < DIM3 + H3m1; ++kk) { - a(ii, jj, kk) = triple_t( - ii - H1m1 + (DIM1)*coords[0], jj - H2m1 + (DIM2)*coords[1], kk - H3m1 + (DIM3)*coords[2]); - } + for (int ii = H1m1; ii < DIM1 + H1m1; ++ii) + for (int jj = H2m1; jj < DIM2 + H2m1; ++jj) + for (int kk = H3m1; kk < DIM3 + H3m1; ++kk) + { + a(ii, jj, kk) = triple_t(ii - H1m1 + (DIM1)*coords[0], + jj - H2m1 + (DIM2)*coords[1], kk - H3m1 + (DIM3)*coords[2]); + } - for (int ii = H1m2; ii < DIM1 + H1m2; ++ii) - for (int jj = H2m2; jj < DIM2 + H2m2; ++jj) - for (int kk = H3m2; kk < DIM3 + H3m2; ++kk) { - b(ii, jj, kk) = triple_t(ii - H1m2 + (DIM1)*coords[0] + B_ADD, - jj - H2m2 + (DIM2)*coords[1] + B_ADD, - kk - H3m2 + (DIM3)*coords[2] + B_ADD); - } + for (int ii = H1m2; ii < DIM1 + H1m2; ++ii) + for (int jj = H2m2; jj < DIM2 + H2m2; ++jj) + for (int kk = H3m2; kk < DIM3 + H3m2; ++kk) + { + b(ii, jj, kk) = triple_t(ii - H1m2 + (DIM1)*coords[0] + B_ADD, + jj - H2m2 + (DIM2)*coords[1] + B_ADD, kk - H3m2 + (DIM3)*coords[2] + B_ADD); + } - for (int ii = H1m3; ii < DIM1 + H1m3; ++ii) - for (int jj = H2m3; jj < DIM2 + H2m3; ++jj) - for (int kk = H3m3; kk < DIM3 + H3m3; ++kk) { - c(ii, jj, kk) = triple_t(ii - H1m3 + (DIM1)*coords[0] + C_ADD, - jj - H2m3 + (DIM2)*coords[1] + C_ADD, - kk - H3m3 + (DIM3)*coords[2] + C_ADD); - } + for (int ii = H1m3; ii < DIM1 + H1m3; ++ii) + for (int jj = H2m3; jj < DIM2 + H2m3; ++jj) + for (int kk = H3m3; kk < DIM3 + H3m3; ++kk) + { + c(ii, jj, kk) = triple_t(ii - H1m3 + (DIM1)*coords[0] + C_ADD, + jj - H2m3 + (DIM2)*coords[1] + C_ADD, kk - H3m3 + (DIM3)*coords[2] + C_ADD); + } - file << "A \n"; - printbuff(file, a, DIM1 
+ H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); - file << "B \n"; - printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); - file << "C \n"; - printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); - file.flush(); + file << "A \n"; + printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); + file << "B \n"; + printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); + file << "C \n"; + printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); + file.flush(); - data_dsc_type_1 data_dsc_a{local_domains[0], coordinate_t{H1m1, H2m1, H3m1}, a}; - data_dsc_type_2 data_dsc_b{local_domains[0], coordinate_t{H1m2, H2m2, H3m2}, b}; - data_dsc_type_3 data_dsc_c{local_domains[0], coordinate_t{H1m3, H2m3, H3m3}, c}; + data_dsc_type_1 data_dsc_a{local_domains[0], coordinate_t{H1m1, H2m1, H3m1}, a}; + data_dsc_type_2 data_dsc_b{local_domains[0], coordinate_t{H1m2, H2m2, H3m2}, b}; + data_dsc_type_3 data_dsc_c{local_domains[0], coordinate_t{H1m3, H2m3, H3m3}, c}; - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - gettimeofday(&start_tv, nullptr); + gettimeofday(&start_tv, nullptr); #ifndef NDEBUG - std::stringstream ss; - ss << pid; - std::string filename = "tout" + ss.str() + ".txt"; - std::ofstream tfile(filename.c_str()); - tfile << "\nFILE for " << pid << "\n"; + std::stringstream ss; + ss << pid; + std::string filename = "tout" + ss.str() + ".txt"; + std::ofstream tfile(filename.c_str()); + tfile << "\nFILE for " << pid << "\n"; #endif - if ((halos_1 == halos_2) && (halos_2 == halos_3)) { - - auto h_1 = cos_1[0].exchange(data_dsc_a, data_dsc_b, data_dsc_c); - h_1.wait(); - - } else { - - auto h_1 = cos_1[0].exchange(data_dsc_a); - h_1.wait(); - auto h_2 = cos_2[0].exchange(data_dsc_b); - h_2.wait(); - auto h_3 = cos_3[0].exchange(data_dsc_c); - h_3.wait(); - - } + if ((halos_1 == halos_2) && (halos_2 == halos_3)) + { + auto h_1 = 
cos_1[0].exchange(data_dsc_a, data_dsc_b, data_dsc_c); + h_1.wait(); + } + else + { + auto h_1 = cos_1[0].exchange(data_dsc_a); + h_1.wait(); + auto h_2 = cos_2[0].exchange(data_dsc_b); + h_2.wait(); + auto h_3 = cos_3[0].exchange(data_dsc_c); + h_3.wait(); + } #ifndef NDEBUG - tfile.flush(); - tfile.close(); + tfile.flush(); + tfile.close(); #endif - gettimeofday(&stop1_tv, nullptr); + gettimeofday(&stop1_tv, nullptr); - lapse_time1 = - ((static_cast(stop1_tv.tv_sec) + 1 / 1000000.0 * static_cast(stop1_tv.tv_usec)) - - (static_cast(start_tv.tv_sec) + 1 / 1000000.0 * static_cast(start_tv.tv_usec))) * - 1000.0; + lapse_time1 = ((static_cast(stop1_tv.tv_sec) + + 1 / 1000000.0 * static_cast(stop1_tv.tv_usec)) - + (static_cast(start_tv.tv_sec) + + 1 / 1000000.0 * static_cast(start_tv.tv_usec))) * + 1000.0; - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - file << "TIME TOT : " << lapse_time1 << "ms" << std::endl; + file << "TIME TOT : " << lapse_time1 << "ms" << std::endl; - /* + /* file << "Detailed times :" << std::endl; double sum_times{0.0}; for (auto const& time : m_co.get_times()) { @@ -323,1697 +327,717 @@ namespace halo_exchange_3D_generic_full { file << "Sum of detailed times : " << sum_times << "ms" << std::endl; */ - file << "\n********************************************************************************\n"; + file << "\n********************************************************************************\n"; - file << "A \n"; - printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); - file << "B \n"; - printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); - file << "C \n"; - printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); - file.flush(); + file << "A \n"; + printbuff(file, a, DIM1 + H1m1 + H1p1, DIM2 + H2m1 + H2p1, DIM3 + H3m1 + H3p1); + file << "B \n"; + printbuff(file, b, DIM1 + H1m2 + H1p2, DIM2 + H2m2 + H2p2, DIM3 + H3m2 + H3p2); + file << "C \n"; + 
printbuff(file, c, DIM1 + H1m3 + H1p3, DIM2 + H2m3 + H2p3, DIM3 + H3m3 + H3p3); + file.flush(); - int passed = true; + int passed = true; - /* Checking the data arrived correctly in the whole region + /* Checking the data arrived correctly in the whole region */ - for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) - for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) - for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) { - - triple_t ta; - int tax, tay, taz; + for (int ii = 0; ii < DIM1 + H1m1 + H1p1; ++ii) + for (int jj = 0; jj < DIM2 + H2m1 + H2p1; ++jj) + for (int kk = 0; kk < DIM3 + H3m1 + H3p1; ++kk) + { + triple_t ta; + int tax, tay, taz; - tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); + tax = modulus(ii - H1m1 + (DIM1)*coords[0], DIM1 * dims[0]); - tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); + tay = modulus(jj - H2m1 + (DIM2)*coords[1], DIM2 * dims[1]); - taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); + taz = modulus(kk - H3m1 + (DIM3)*coords[2], DIM3 * dims[2]); - if (!per0) { - if (((coords[0] == 0) && (ii < H1m1)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m1))) { - tax = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m1)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m1))) + { + tax = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m1)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) { - tay = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m1)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m1))) + { + tay = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m1)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) { - taz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m1)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m1))) + { + taz = triple_t().z(); } + } - ta = triple_t(tax, tay, taz).floor(); + ta = triple_t(tax, tay, taz).floor(); - if (a(ii, jj, kk) != ta) { 
- passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "a " << a(ii, jj, kk) << " != " << ta << "\n"; - } + if (a(ii, jj, kk) != ta) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "a " << a(ii, jj, kk) << " != " << ta << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) - for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) - for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) { - - triple_t tb; - int tbx, tby, tbz; + for (int ii = 0; ii < DIM1 + H1m2 + H1p2; ++ii) + for (int jj = 0; jj < DIM2 + H2m2 + H2p2; ++jj) + for (int kk = 0; kk < DIM3 + H3m2 + H3p2; ++kk) + { + triple_t tb; + int tbx, tby, tbz; - tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; + tbx = modulus(ii - H1m2 + (DIM1)*coords[0], DIM1 * dims[0]) + B_ADD; - tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; + tby = modulus(jj - H2m2 + (DIM2)*coords[1], DIM2 * dims[1]) + B_ADD; - tbz = modulus(kk - H3m2 + (DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; + tbz = modulus(kk - H3m2 + (DIM3)*coords[2], DIM3 * dims[2]) + B_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m2)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) { - tbx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m2)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m2))) + { + tbx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m2)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) { - tby = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m2)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m2))) + { + tby = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m2)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) { - tbz = triple_t().z(); - } + if (!per2) + { + if (((coords[2] == 0) && (kk < H3m2)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m2))) + { + tbz = triple_t().z(); 
} + } - tb = triple_t(tbx, tby, tbz).floor(); + tb = triple_t(tbx, tby, tbz).floor(); - if (b(ii, jj, kk) != tb) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "b " << b(ii, jj, kk) << " != " << tb << "\n"; - } + if (b(ii, jj, kk) != tb) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "b " << b(ii, jj, kk) << " != " << tb << "\n"; } + } - for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) - for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) - for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) { - - triple_t tc; - int tcx, tcy, tcz; + for (int ii = 0; ii < DIM1 + H1m3 + H1p3; ++ii) + for (int jj = 0; jj < DIM2 + H2m3 + H2p3; ++jj) + for (int kk = 0; kk < DIM3 + H3m3 + H3p3; ++kk) + { + triple_t tc; + int tcx, tcy, tcz; - tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; + tcx = modulus(ii - H1m3 + (DIM1)*coords[0], DIM1 * dims[0]) + C_ADD; - tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * dims[1]) + C_ADD; + tcy = modulus(jj - H2m3 + (DIM2)*coords[1], DIM2 * dims[1]) + C_ADD; - tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; + tcz = modulus(kk - H3m3 + (DIM3)*coords[2], DIM3 * dims[2]) + C_ADD; - if (!per0) { - if (((coords[0] == 0) && (ii < H1m3)) || ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) { - tcx = triple_t().x(); - } + if (!per0) + { + if (((coords[0] == 0) && (ii < H1m3)) || + ((coords[0] == dims[0] - 1) && (ii >= DIM1 + H1m3))) + { + tcx = triple_t().x(); } + } - if (!per1) { - if (((coords[1] == 0) && (jj < H2m3)) || ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) { - tcy = triple_t().y(); - } + if (!per1) + { + if (((coords[1] == 0) && (jj < H2m3)) || + ((coords[1] == dims[1] - 1) && (jj >= DIM2 + H2m3))) + { + tcy = triple_t().y(); } + } - if (!per2) { - if (((coords[2] == 0) && (kk < H3m3)) || ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) { - tcz = triple_t().z(); - } + if (!per2) + { + if 
(((coords[2] == 0) && (kk < H3m3)) || + ((coords[2] == dims[2] - 1) && (kk >= DIM3 + H3m3))) + { + tcz = triple_t().z(); } + } - tc = triple_t(tcx, tcy, tcz).floor(); + tc = triple_t(tcx, tcy, tcz).floor(); - if (c(ii, jj, kk) != tc) { - passed = false; - file << ii << ", " << jj << ", " << kk << " values found != expected: " - << "c " << c(ii, jj, kk) << " != " << tc << "\n"; - } + if (c(ii, jj, kk) != tc) + { + passed = false; + file << ii << ", " << jj << ", " << kk << " values found != expected: " + << "c " << c(ii, jj, kk) << " != " << tc << "\n"; } + } - if (passed) - file << "RESULT: PASSED!\n"; - else - file << "RESULT: FAILED!\n"; + if (passed) file << "RESULT: PASSED!\n"; + else + file << "RESULT: FAILED!\n"; - return passed; - } + return passed; +} - bool test(int DIM1, - int DIM2, - int DIM3, - int H1m1, - int H1p1, - int H2m1, - int H2p1, - int H3m1, - int H3p1, - int H1m2, - int H1p2, - int H2m2, - int H2p2, - int H3m2, - int H3p2, - int H1m3, - int H1p3, - int H2m3, - int H2p3, - int H3m3, - int H3p3) { - - /* Here we compute the computing grid as in many applications +bool +test(int DIM1, int DIM2, int DIM3, int H1m1, int H1p1, int H2m1, int H2p1, int H3m1, int H3p1, + int H1m2, int H1p2, int H2m2, int H2p2, int H3m2, int H3p2, int H1m3, int H1p3, int H2m3, + int H2p3, int H3m3, int H3p3) +{ + /* Here we compute the computing grid as in many applications */ - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - std::cout << pid << " " << nprocs << "\n"; + std::cout << pid << " " << nprocs << "\n"; - std::stringstream ss; - ss << pid; + std::stringstream ss; + ss << pid; - std::string filename = "out" + ss.str() + ".txt"; + std::string filename = "out" + ss.str() + ".txt"; - std::cout << filename << std::endl; - std::ofstream file(filename.c_str()); + std::cout << filename << std::endl; + std::ofstream file(filename.c_str()); - file << pid << 
" " << nprocs << "\n"; + file << pid << " " << nprocs << "\n"; - MPI_Dims_create(nprocs, 3, dims); - int period[3] = {1, 1, 1}; + MPI_Dims_create(nprocs, 3, dims); + int period[3] = {1, 1, 1}; - file << "@" << pid << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " << dims[2] << "\n"; + file << "@" << pid << "@ MPI GRID SIZE " << dims[0] << " - " << dims[1] << " - " << dims[2] + << "\n"; - MPI_Cart_create(MPI_COMM_WORLD, 3, dims, period, false, &CartComm); + MPI_Cart_create(MPI_COMM_WORLD, 3, dims, period, false, &CartComm); - MPI_Cart_get(CartComm, 3, dims, period, coords); + MPI_Cart_get(CartComm, 3, dims, period, coords); - auto context_ptr = gridtools::ghex::tl::context_factory::create(CartComm); - auto& context = *context_ptr; - auto comm = context.get_communicator(); + auto context_ptr = gridtools::ghex::tl::context_factory::create(CartComm); + auto& context = *context_ptr; + auto comm = context.get_communicator(); - /* Each process will hold a tile of size + /* Each process will hold a tile of size (DIM1+2*H)x(DIM2+2*H)x(DIM3+2*H). The DIM1xDIM2xDIM3 area inside the H width border is the inner region of an hypothetical stencil computation whise halo width is H. 
*/ - file << "Field A " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m1 << " - " << H1p1 << ", " - << "Halo along j " << H2m1 << " - " << H2p1 << ", " - << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; - - file << "Field B " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m2 << " - " << H1p2 << ", " - << "Halo along j " << H2m2 << " - " << H2p2 << ", " - << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; - - file << "Field C " - << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " - << "Halo along i " << H1m3 << " - " << H1p3 << ", " - << "Halo along j " << H2m3 << " - " << H2p3 << ", " - << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; - file.flush(); - - /* This example will exchange 3 data arrays at the same time with + file << "Field A " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m1 << " - " << H1p1 << ", " + << "Halo along j " << H2m1 << " - " << H2p1 << ", " + << "Halo along k " << H3m1 << " - " << H3p1 << std::endl; + + file << "Field B " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m2 << " - " << H1p2 << ", " + << "Halo along j " << H2m2 << " - " << H2p2 << ", " + << "Halo along k " << H3m2 << " - " << H3p2 << std::endl; + + file << "Field C " + << "size = " << DIM1 << "x" << DIM2 << "x" << DIM3 << " " + << "Halo along i " << H1m3 << " - " << H1p3 << ", " + << "Halo along j " << H2m3 << " - " << H2p3 << ", " + << "Halo along k " << H3m3 << " - " << H3p3 << std::endl; + file.flush(); + + /* This example will exchange 3 data arrays at the same time with different values. 
*/ - triple_t *_a = - new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; - triple_t *_b = - new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; - triple_t *_c = - new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; - - file << "Permutation 0,1,2\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - - bool passed = true; - - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - 
H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 0,2,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - 
H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,0,2\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and 
run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, 
context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 1,2,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, 
DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H31, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,0,1\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - 
H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - 
H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - file << "Permutation 2,1,0\n"; - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, " - "_c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - 
H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file - << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - _a, - _b, - _c); - - file << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " - "_a, " - "_b, _c)\n"; - passed = passed and run(file, context, comm, - DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3, - - _a, - _b, - _c); - file << "---------------------------------------------------\n"; - - delete[] _a; - delete[] _b; - delete[] _c; - - return passed; - } + triple_t* _a = new triple_t[(DIM1 + H1m1 + H1p1) * (DIM2 + H2m1 + H2p1) * (DIM3 + H3m1 + H3p1)]; + triple_t* _b = new triple_t[(DIM1 + H1m2 + H1p2) * (DIM2 + H2m2 + H2p2) * (DIM3 + H3m2 + H3p2)]; + triple_t* _c = new triple_t[(DIM1 + H1m3 + H1p3) * (DIM2 + H2m3 + H2p3) * (DIM3 + H3m3 + H3p3)]; + + file << "Permutation 0,1,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + + bool passed = true; + + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, 
context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 0,2,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, 
_b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, 
H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,0,2\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed 
= passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 1,2,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, 
H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 2,0,1\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + 
DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + file << "Permutation 2,1,0\n"; + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << 
"run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, " + "_c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, _a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, _a, _b, _c); + + file + << "run(file, DIM1, DIM2, DIM3, H1m, H1p, H2m, H2p, H3m, H3p, " + "_a, " + "_b, _c)\n"; + passed = passed and run(file, context, comm, DIM1, + DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, H1p2, H2m2, H2p2, + H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3, + + _a, _b, _c); + file << "---------------------------------------------------\n"; + + delete[] _a; + delete[] _b; + delete[] _c; + + return passed; +} } // namespace halo_exchange_3D_generic_full - -int main(int argc, char **argv) { - +int +main(int argc, char** argv) +{ MPI_Init(&argc, &argv); - 
if (argc != 22) { - std::cout << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " - "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " - "halo width" - << std::endl; + if (argc != 22) + { + std::cout + << "Usage: test_halo_exchange_3D dimx dimy dimz h1m1 hip1 h2m1 h2m1 h3m1 h3p1 h1m2 hip2 h2m2 h2m2 " + "h3m2 h3p2 h1m3 hip3 h2m3 h2m3 h3m3 h3p3\n where args are integer sizes of the data fields and " + "halo width" + << std::endl; return 1; } int DIM1 = atoi(argv[1]); @@ -2038,28 +1062,8 @@ int main(int argc, char **argv) { int H3m3 = atoi(argv[20]); int H3p3 = atoi(argv[21]); - halo_exchange_3D_generic_full::test(DIM1, - DIM2, - DIM3, - H1m1, - H1p1, - H2m1, - H2p1, - H3m1, - H3p1, - H1m2, - H1p2, - H2m2, - H2p2, - H3m2, - H3p2, - H1m3, - H1p3, - H2m3, - H2p3, - H3m3, - H3p3); + halo_exchange_3D_generic_full::test(DIM1, DIM2, DIM3, H1m1, H1p1, H2m1, H2p1, H3m1, H3p1, H1m2, + H1p2, H2m2, H2p2, H3m2, H3p2, H1m3, H1p3, H2m3, H2p3, H3m3, H3p3); MPI_Finalize(); - } diff --git a/benchmarks/simple_rma.cpp b/benchmarks/simple_rma.cpp index 0c3e9a30..8f6d7a18 100644 --- a/benchmarks/simple_rma.cpp +++ b/benchmarks/simple_rma.cpp @@ -51,10 +51,7 @@ struct simulation template struct cuda_deleter { - void operator()(T* ptr) - { - cudaFree(ptr); - } + void operator()(T* ptr) { cudaFree(ptr); } }; #endif @@ -62,58 +59,58 @@ struct simulation using context_type = typename gridtools::ghex::tl::context_factory::context_type; using context_ptr_type = std::unique_ptr; - using domain_descriptor_type = gridtools::ghex::structured::regular::domain_descriptor>; - using halo_generator_type = gridtools::ghex::structured::regular::halo_generator>; + using domain_descriptor_type = gridtools::ghex::structured::regular::domain_descriptor>; + using halo_generator_type = + gridtools::ghex::structured::regular::halo_generator>; template - using field_descriptor_type = 
gridtools::ghex::structured::regular::field_descriptor; + using field_descriptor_type = gridtools::ghex::structured::regular::field_descriptor; - using field_type = field_descriptor_type>; + using field_type = + field_descriptor_type>; #ifdef GHEX_CUDACC - using gpu_field_type = field_descriptor_type>; + using gpu_field_type = + field_descriptor_type>; #endif using decomp_type = gridtools::ghex::hierarchical_decomposition<3>; - int num_reps; - decomp_type decomp; - int num_threads; - bool mt; - const int num_fields; - int ext; - context_ptr_type context_ptr; - context_type& context; - const std::array local_ext; - const std::array periodic; - const std::array g_first; - const std::array g_last; - const std::array offset; - std::array halos; - const std::array local_ext_buffer; - halo_generator_type halo_gen; - std::vector local_domains; - const int max_memory; + int num_reps; + decomp_type decomp; + int num_threads; + bool mt; + const int num_fields; + int ext; + context_ptr_type context_ptr; + context_type& context; + const std::array local_ext; + const std::array periodic; + const std::array g_first; + const std::array g_last; + const std::array offset; + std::array halos; + const std::array local_ext_buffer; + halo_generator_type halo_gen; + std::vector local_domains; + const int max_memory; std::vector>> fields_raw; - std::vector> fields; + std::vector> fields; #ifdef GHEX_CUDACC - std::vector>>> fields_raw_gpu; - std::vector> fields_gpu; + std::vector>>> fields_raw_gpu; + std::vector> fields_gpu; #endif - typename context_type::communicator_type comm; - std::vector comms; + typename context_type::communicator_type comm; + std::vector comms; std::vector cos; - using pattern_type = std::remove_reference_t(context, halo_gen, local_domains))>; + using pattern_type = std::remove_reference_t(context, halo_gen, local_domains))>; std::unique_ptr pattern; - std::mutex io_mutex; + std::mutex io_mutex; std::vector timer_vec; - simulation( - int num_reps_, - int ext_, - 
int halo, - int num_fields_, - const decomp_type& decomp_) + simulation(int num_reps_, int ext_, int halo, int num_fields_, const decomp_type& decomp_) : num_reps{num_reps_} , decomp(decomp_) , num_threads(decomp.threads_per_rank()) @@ -122,22 +119,19 @@ struct simulation , ext{ext_} , context_ptr{gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD)} , context{*context_ptr} - , local_ext{ext,ext,ext} - , periodic{true,true,true} - , g_first{0,0,0} - , g_last{ - decomp.last_coord()[0]*local_ext[0]+local_ext[0]-1, - decomp.last_coord()[1]*local_ext[1]+local_ext[1]-1, - decomp.last_coord()[2]*local_ext[2]+local_ext[2]-1} - , offset{halo,halo,halo} - , halos{halo,halo,halo,halo,halo,halo} - , local_ext_buffer{ - local_ext[0]+halos[0]+halos[1], - local_ext[1]+halos[2]+halos[3], - local_ext[2]+halos[4]+halos[5]} + , local_ext{ext, ext, ext} + , periodic{true, true, true} + , g_first{0, 0, 0} + , g_last{decomp.last_coord()[0] * local_ext[0] + local_ext[0] - 1, + decomp.last_coord()[1] * local_ext[1] + local_ext[1] - 1, + decomp.last_coord()[2] * local_ext[2] + local_ext[2] - 1} + , offset{halo, halo, halo} + , halos{halo, halo, halo, halo, halo, halo} + , local_ext_buffer{local_ext[0] + halos[0] + halos[1], local_ext[1] + halos[2] + halos[3], + local_ext[2] + halos[4] + halos[5]} , halo_gen(g_first, g_last, halos, periodic) - , max_memory{local_ext_buffer[0]*local_ext_buffer[1]*local_ext_buffer[2]} - , comm{ context.get_serial_communicator() } + , max_memory{local_ext_buffer[0] * local_ext_buffer[1] * local_ext_buffer[2]} + , comm{context.get_serial_communicator()} , timer_vec(num_threads) { cos.resize(num_threads); @@ -150,20 +144,20 @@ struct simulation #endif comms = std::vector(num_threads, comm); - for (int j=0; j{x,y,z}, - std::array{x+local_ext[0]-1,y+local_ext[1]-1,z+local_ext[2]-1}}); + int x = coord[0] * local_ext[0]; + int y = coord[1] * local_ext[1]; + int z = coord[2] * local_ext[2]; + local_domains.push_back(domain_descriptor_type{context.rank() * 
num_threads + j, + std::array{x, y, z}, + std::array{x + local_ext[0] - 1, y + local_ext[1] - 1, + z + local_ext[2] - 1}}); } - pattern = std::unique_ptr{new pattern_type{ - gridtools::ghex::make_pattern( + pattern = std::unique_ptr{ + new pattern_type{gridtools::ghex::make_pattern( context, halo_gen, local_domains)}}; } @@ -171,7 +165,7 @@ struct simulation { if (num_threads == 1) { - std::thread t([this](){exchange(0);}); + std::thread t([this]() { exchange(0); }); // Create a cpu_set_t object representing a set of CPUs. Clear it and mark // only CPU = local rank as set. cpu_set_t cpuset; @@ -185,16 +179,20 @@ struct simulation { std::vector threads; threads.reserve(num_threads); - for (int j=0; j(comms[j]); - for (int i=0; i(max_memory) ); - fields[j].push_back(gridtools::ghex::wrap_field>( - local_domains[j], - fields_raw[j].back().data(), - offset, - local_ext_buffer)); + fields_raw[j].push_back(std::vector(max_memory)); + fields[j].push_back( + gridtools::ghex::wrap_field>( + local_domains[j], fields_raw[j].back().data(), offset, local_ext_buffer)); #ifdef GHEX_CUDACC - fields_raw_gpu[j].push_back( std::unique_ptr>{ - [this](){ void* ptr; cudaMalloc(&ptr, max_memory*sizeof(T)); return (T*)ptr; }()}); - fields_gpu[j].push_back(gridtools::ghex::wrap_field>( - local_domains[j], - fields_raw_gpu[j].back().get(), - offset, - local_ext_buffer)); + fields_raw_gpu[j].push_back(std::unique_ptr>{[this]() + { + void* ptr; + cudaMalloc(&ptr, max_memory * sizeof(T)); + return (T*)ptr; + }()}); + fields_gpu[j].push_back( + gridtools::ghex::wrap_field>( + local_domains[j], fields_raw_gpu[j].back().get(), offset, local_ext_buffer)); #endif } auto bco = gridtools::ghex::bulk_communication_object< - gridtools::ghex::structured::rma_range_generator, - pattern_type, + gridtools::ghex::structured::rma_range_generator, pattern_type, #ifndef GHEX_CUDACC field_type #else gpu_field_type #endif - > (basic_co); + >(basic_co); #ifndef GHEX_CUDACC - for (int i=0; 
ioperator()(fields[j][i])); + for (int i = 0; i < num_fields; ++i) bco.add_field(pattern->operator()(fields[j][i])); #else - for (int i=0; ioperator()(fields_gpu[j][i])); + for (int i = 0; i < num_fields; ++i) bco.add_field(pattern->operator()(fields_gpu[j][i])); #endif cos[j] = std::move(bco); // warm up - for (int t = 0; t < 50; ++t) - { - cos[j].exchange().wait(); - } + for (int t = 0; t < 50; ++t) { cos[j].exchange().wait(); } auto start = clock_type::now(); for (int t = 0; t < num_reps; ++t) @@ -254,52 +246,59 @@ struct simulation timer_vec[j].tic(); cos[j].exchange().wait(); timer_vec[j].toc(); - std::cout << "mean time: " << comm.rank() << ":" << j << " " << std::setprecision(12) << timer_vec[j].mean()/1000000.0 << "\n"; + std::cout << "mean time: " << comm.rank() << ":" << j << " " << std::setprecision(12) + << timer_vec[j].mean() / 1000000.0 << "\n"; timer_vec[j].clear(); } - auto end = clock_type::now(); + auto end = clock_type::now(); std::chrono::duration elapsed_seconds = end - start; if (comm.rank() == 0 && j == 0) { const auto num_elements = - local_ext_buffer[0] * local_ext_buffer[1] * local_ext_buffer[2] - - local_ext[0] * local_ext[1] * local_ext[2]; + local_ext_buffer[0] * local_ext_buffer[1] * local_ext_buffer[2] - + local_ext[0] * local_ext[1] * local_ext[2]; const auto num_bytes = num_elements * sizeof(T); const double load = 2 * comm.size() * num_threads * num_fields * num_bytes; const auto GB_per_s = num_reps * load / (elapsed_seconds.count() * 1.0e9); std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n"; std::cout << "GB/s : " << GB_per_s << std::endl; const auto tt = timer_vec[0]; - std::cout << "mean time: " << std::setprecision(12) << tt.mean()/1000000.0 << "\n"; - std::cout << "min time: " << std::setprecision(12) << tt.min()/1000000.0 << "\n"; - std::cout << "max time: " << std::setprecision(12) << tt.max()/1000000.0 << "\n"; - std::cout << "sdev time: " << std::setprecision(12) << tt.stddev()/1000000.0 << "\n"; - 
std::cout << "sdev f time: " << std::setprecision(12) << tt.stddev()/tt.mean() << "\n"; - std::cout << "GB/s mean: " << std::setprecision(12) << load / (tt.mean()*1000.0) << std::endl; - std::cout << "GB/s min: " << std::setprecision(12) << load / (tt.max()*1000.0) << std::endl; - std::cout << "GB/s max: " << std::setprecision(12) << load / (tt.min()*1000.0) << std::endl; - std::cout << "GB/s sdev: " << std::setprecision(12) << (tt.stddev()/tt.mean())* (load / (tt.mean()*1000.0)) << std::endl; + std::cout << "mean time: " << std::setprecision(12) << tt.mean() / 1000000.0 << "\n"; + std::cout << "min time: " << std::setprecision(12) << tt.min() / 1000000.0 << "\n"; + std::cout << "max time: " << std::setprecision(12) << tt.max() / 1000000.0 << "\n"; + std::cout << "sdev time: " << std::setprecision(12) << tt.stddev() / 1000000.0 + << "\n"; + std::cout << "sdev f time: " << std::setprecision(12) << tt.stddev() / tt.mean() + << "\n"; + std::cout << "GB/s mean: " << std::setprecision(12) << load / (tt.mean() * 1000.0) + << std::endl; + std::cout << "GB/s min: " << std::setprecision(12) << load / (tt.max() * 1000.0) + << std::endl; + std::cout << "GB/s max: " << std::setprecision(12) << load / (tt.min() * 1000.0) + << std::endl; + std::cout << "GB/s sdev: " << std::setprecision(12) + << (tt.stddev() / tt.mean()) * (load / (tt.mean() * 1000.0)) << std::endl; } } }; -void print_usage(const char* app_name) +void +print_usage(const char* app_name) { - std::cout - << " -np N " << app_name << " " - << "local-domain-size " - << "num-repetition " - << "halo-size " - << "num-fields " - << "node-decompositon " - << "numa-decompositon " - << "rank-decompositon " - << "thread-decompositon " - << std::endl; + std::cout << " -np N " << app_name << " " + << "local-domain-size " + << "num-repetition " + << "halo-size " + << "num-fields " + << "node-decompositon " + << "numa-decompositon " + << "rank-decompositon " + << "thread-decompositon " << std::endl; } -int main(int argc, char** 
argv) +int +main(int argc, char** argv) { if (argc != 17) { @@ -307,30 +306,30 @@ int main(int argc, char** argv) return 1; } - int domain_size = std::atoi(argv[1]); - int num_repetitions = std::atoi(argv[2]); - int halo = std::atoi(argv[3]); - int num_fields = std::atoi(argv[4]); - std::array node_decomposition; - std::array numa_decomposition; - std::array rank_decomposition; - std::array thread_decomposition; - int num_ranks = 1; - int num_threads = 1; + int domain_size = std::atoi(argv[1]); + int num_repetitions = std::atoi(argv[2]); + int halo = std::atoi(argv[3]); + int num_fields = std::atoi(argv[4]); + std::array node_decomposition; + std::array numa_decomposition; + std::array rank_decomposition; + std::array thread_decomposition; + int num_ranks = 1; + int num_threads = 1; for (int i = 0; i < 3; ++i) { - node_decomposition[i] = std::atoi(argv[i+5]); - numa_decomposition[i] = std::atoi(argv[i+5+3]); - rank_decomposition[i] = std::atoi(argv[i+5+6]); - thread_decomposition[i] = std::atoi(argv[i+5+9]); - num_ranks *= node_decomposition[i]*numa_decomposition[i]*rank_decomposition[i]; + node_decomposition[i] = std::atoi(argv[i + 5]); + numa_decomposition[i] = std::atoi(argv[i + 5 + 3]); + rank_decomposition[i] = std::atoi(argv[i + 5 + 6]); + thread_decomposition[i] = std::atoi(argv[i + 5 + 9]); + num_ranks *= node_decomposition[i] * numa_decomposition[i] * rank_decomposition[i]; num_threads *= thread_decomposition[i]; } typename simulation::decomp_type decomp(node_decomposition, numa_decomposition, rank_decomposition, thread_decomposition); - int required = num_threads>1 ? MPI_THREAD_MULTIPLE : MPI_THREAD_SINGLE; + int required = num_threads > 1 ? 
MPI_THREAD_MULTIPLE : MPI_THREAD_SINGLE; int provided; int init_result = MPI_Init_thread(&argc, &argv, required, &provided); if (init_result == MPI_ERR_OTHER) @@ -347,7 +346,7 @@ int main(int argc, char** argv) MPI_Barrier(MPI_COMM_WORLD); int world_size; - MPI_Comm_size(MPI_COMM_WORLD,&world_size); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); if (world_size != num_ranks) { std::cout << "processor decomposition is wrong" << std::endl; @@ -359,7 +358,7 @@ int main(int argc, char** argv) simulation sim(num_repetitions, domain_size, halo, num_fields, decomp); sim.exchange(); - + MPI_Barrier(MPI_COMM_WORLD); } MPI_Finalize(); diff --git a/benchmarks/transport/ghex_p2p_bi_cb_avail_mt.cpp b/benchmarks/transport/ghex_p2p_bi_cb_avail_mt.cpp index 252bb9a2..cbc6072e 100644 --- a/benchmarks/transport/ghex_p2p_bi_cb_avail_mt.cpp +++ b/benchmarks/transport/ghex_p2p_bi_cb_avail_mt.cpp @@ -23,11 +23,11 @@ namespace ghex = gridtools::ghex; #ifdef GHEX_USE_UCP // UCX backend #include -using transport = ghex::tl::ucx_tag; +using transport = ghex::tl::ucx_tag; #else // MPI backend #include -using transport = ghex::tl::mpi_tag; +using transport = ghex::tl::mpi_tag; #endif #include @@ -37,7 +37,6 @@ using future_type = typename communicator_type::request_cb_type; using MsgType = gridtools::ghex::tl::shared_message_buffer<>; - #ifdef GHEX_USE_OPENMP std::atomic sent(0); std::atomic received(0); @@ -56,18 +55,19 @@ int tail_recv(0); #define THREADID 0 #endif -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int niter, buff_size; - int inflight; - int mode; + int niter, buff_size; + int inflight; + int mode; gridtools::ghex::timer timer, ttimer; - if(argc != 4) - { - std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; - std::terminate(); - } + if (argc != 4) + { + std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; + std::terminate(); + } niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); @@ -86,7 +86,8 @@ int 
main(int argc, char *argv[]) #ifdef GHEX_USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ + if (mode != MPI_THREAD_MULTIPLE) + { std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; std::terminate(); } @@ -95,18 +96,18 @@ int main(int argc, char *argv[]) #endif { - auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; #ifdef GHEX_USE_OPENMP #pragma omp parallel #endif { - auto comm = context.get_communicator(); - const auto rank = comm.rank(); - const auto size = comm.size(); - const auto thread_id = THREADID; - const auto peer_rank = (rank+1)%2; + auto comm = context.get_communicator(); + const auto rank = comm.rank(); + const auto size = comm.size(); + const auto thread_id = THREADID; + const auto peer_rank = (rank + 1) % 2; bool using_mt = false; #ifdef GHEX_USE_OPENMP @@ -119,124 +120,130 @@ int main(int argc, char *argv[]) int dbg = 0, sdbg = 0, rdbg = 0; auto send_callback = [&](communicator_type::message_type, int, int tag) - { - int pthr = tag/inflight; - if(pthr != thread_id) nlsend_cnt++; - comm_cnt++; - sent++; - }; + { + int pthr = tag / inflight; + if (pthr != thread_id) nlsend_cnt++; + comm_cnt++; + sent++; + }; auto recv_callback = [&](communicator_type::message_type, int, int tag) - { - int pthr = tag/inflight; - if(pthr != thread_id) nlrecv_cnt++; - //printf("rank %d thrid %d tag %d pthr %d\n", rank, thread_id, tag, pthr); - comm_cnt++; - received++; - }; - - if (thread_id==0 && rank==0) - { - std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; - }; - - std::vector smsgs(inflight); - std::vector rmsgs(inflight); + { + int pthr = tag / inflight; + if (pthr != thread_id) nlrecv_cnt++; + //printf("rank %d thrid %d tag %d pthr %d\n", rank, thread_id, tag, pthr); + comm_cnt++; + received++; + }; + + if (thread_id == 0 
&& rank == 0) + { + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " + << typeid(comm).name() << "\n\n"; + }; + + std::vector smsgs(inflight); + std::vector rmsgs(inflight); std::vector sreqs(inflight); std::vector rreqs(inflight); - for(int j=0; j= (niter/10)) - { - dbg = 0; - std::cout << rank << " total bwdt MB/s: " - << ((double)(received-last_received + sent-last_sent)*size*buff_size/2)/timer.stoc() - << "\n"; - timer.tic(); - last_received = received; - last_sent = sent; - } - - if(rank==0 && thread_id==0 && rdbg >= (niter/10)) - { - std::cout << received << " received\n"; - rdbg = 0; - } - - if(rank==0 && thread_id==0 && sdbg >= (niter/10)) - { - std::cout << sent << " sent\n"; - sdbg = 0; - } - - for(int j=0; j= (niter / 10)) + { + dbg = 0; + std::cout << rank << " total bwdt MB/s: " + << ((double)(received - last_received + sent - last_sent) * size * + buff_size / 2) / + timer.stoc() + << "\n"; + timer.tic(); + last_received = received; + last_sent = sent; + } + + if (rank == 0 && thread_id == 0 && rdbg >= (niter / 10)) + { + std::cout << received << " received\n"; + rdbg = 0; + } + + if (rank == 0 && thread_id == 0 && sdbg >= (niter / 10)) + { + std::cout << sent << " sent\n"; + sdbg = 0; + } + + for (int j = 0; j < inflight; j++) + { + //if(rmsgs[j].use_count() == 1) + if (rreqs[j].test()) + { + submit_recv_cnt += num_threads; + rdbg += num_threads; + dbg += num_threads; + rreqs[j] = + comm.recv(rmsgs[j], peer_rank, thread_id * inflight + j, recv_callback); + lrecv++; + } + else + comm.progress(); + + // if(lsent < lrecv+2*inflight && sent < niter && smsgs[j].use_count() == 1) + if (lsent < lrecv + 2 * inflight && sent < niter && sreqs[j].test()) + { + submit_cnt += num_threads; + sdbg += num_threads; + dbg += num_threads; + sreqs[j] = + comm.send(smsgs[j], peer_rank, thread_id * inflight + j, send_callback); + lsent++; + } + else + comm.progress(); + } + } + + barrier(comm); + + if (thread_id == 0 && rank == 0) + { + const auto t = 
ttimer.stoc(); + std::cout << "time: " << t / 1000000 << "s\n"; + std::cout << "final MB/s: " << ((double)niter * size * buff_size) / t << "\n"; + } // stop here to help produce a nice std output - barrier(comm); + barrier(comm); #ifdef GHEX_USE_OPENMP #pragma omp critical #endif { - std::cout << "rank " << rank << " thread " << thread_id - << " sends submitted " << submit_cnt/num_threads - << " serviced " << comm_cnt << ", non-local sends " - << nlsend_cnt << " non-local recvs " << nlrecv_cnt << "\n"; + std::cout << "rank " << rank << " thread " << thread_id << " sends submitted " + << submit_cnt / num_threads << " serviced " << comm_cnt + << ", non-local sends " << nlsend_cnt << " non-local recvs " << nlrecv_cnt + << "\n"; } // tail loops - submit RECV requests until @@ -251,58 +258,68 @@ int main(int argc, char *argv[]) do { comm.progress(); // check if we have completed all our posted sends - if(!send_complete){ + if (!send_complete) + { incomplete_sends = 0; - for(int j=0; j -using transport = ghex::tl::ucx_tag; +using transport = ghex::tl::ucx_tag; #else // MPI backend #include -using transport = ghex::tl::mpi_tag; +using transport = ghex::tl::mpi_tag; #endif #include @@ -37,7 +37,6 @@ using future_type = typename communicator_type::request_cb_type; using MsgType = gridtools::ghex::tl::shared_message_buffer<>; - #ifdef GHEX_USE_OPENMP std::atomic sent(0); std::atomic received(0); @@ -52,18 +51,19 @@ int received; #define THREADID 0 #endif -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int niter, buff_size; - int inflight; - int mode; + int niter, buff_size; + int inflight; + int mode; gridtools::ghex::timer timer, ttimer; - if(argc != 4) - { - std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; - std::terminate(); - } + if (argc != 4) + { + std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; + std::terminate(); + } niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); @@ -81,7 +81,8 
@@ int main(int argc, char *argv[]) #ifdef GHEX_USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ + if (mode != MPI_THREAD_MULTIPLE) + { std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; std::terminate(); } @@ -90,18 +91,18 @@ int main(int argc, char *argv[]) #endif { - auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; #ifdef GHEX_USE_OPENMP #pragma omp parallel #endif { - auto comm = context.get_communicator(); - const auto rank = comm.rank(); - const auto size = comm.size(); - const auto thread_id = THREADID; - const auto peer_rank = (rank+1)%2; + auto comm = context.get_communicator(); + const auto rank = comm.rank(); + const auto size = comm.size(); + const auto thread_id = THREADID; + const auto peer_rank = (rank + 1) % 2; bool using_mt = false; #ifdef GHEX_USE_OPENMP @@ -111,34 +112,37 @@ int main(int argc, char *argv[]) int comm_cnt = 0, nlsend_cnt = 0, nlrecv_cnt = 0; auto send_callback = [&](communicator_type::message_type, int, int tag) - { - // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; - int pthr = tag/inflight; - if(pthr != thread_id) nlsend_cnt++; - comm_cnt++; - sent++; - }; + { + // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; + int pthr = tag / inflight; + if (pthr != thread_id) nlsend_cnt++; + comm_cnt++; + sent++; + }; auto recv_callback = [&](communicator_type::message_type, int, int tag) - { - // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; - int pthr = tag/inflight; - if(pthr != thread_id) nlrecv_cnt++; - comm_cnt++; - received++; - }; - - if (thread_id==0 && rank==0) - { - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << 
typeid(comm).name() << "\n\n"; - } - - std::vector smsgs; - std::vector rmsgs; + { + // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; + int pthr = tag / inflight; + if (pthr != thread_id) nlrecv_cnt++; + comm_cnt++; + received++; + }; + + if (thread_id == 0 && rank == 0) + { + if (rank == 0) + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " + << typeid(comm).name() << "\n\n"; + } + + std::vector smsgs; + std::vector rmsgs; std::vector sreqs; std::vector rreqs; - for(int j=0; j= (niter/10)) { + if (thread_id == 0 && dbg >= (niter / 10)) + { dbg = 0; std::cout << rank << " total bwdt MB/s: " - << ((double)(i-last_i)*size*buff_size)/timer.stoc() - << "\n"; + << ((double)(i - last_i) * size * buff_size) / timer.stoc() << "\n"; timer.tic(); last_i = i; } // submit inflight requests - for(int j=0; j -using transport = ghex::tl::ucx_tag; +using transport = ghex::tl::ucx_tag; #else // MPI backend #include -using transport = ghex::tl::mpi_tag; +using transport = ghex::tl::mpi_tag; #endif #include @@ -37,7 +37,6 @@ using future_type = typename communicator_type::future; using MsgType = gridtools::ghex::tl::message_buffer<>; - #ifdef GHEX_USE_OPENMP std::atomic sent(0); std::atomic received(0); @@ -56,18 +55,19 @@ int tail_recv(0); #define THREADID 0 #endif -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int niter, buff_size; - int inflight; - int mode; + int niter, buff_size; + int inflight; + int mode; gridtools::ghex::timer timer, ttimer; - if(argc != 4) - { - std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; - std::terminate(); - } + if (argc != 4) + { + std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; + std::terminate(); + } niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); @@ -86,7 +86,8 @@ int main(int argc, char *argv[]) #ifdef GHEX_USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); 
- if(mode != MPI_THREAD_MULTIPLE){ + if (mode != MPI_THREAD_MULTIPLE) + { std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; std::terminate(); } @@ -95,109 +96,120 @@ int main(int argc, char *argv[]) #endif { - auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; #ifdef GHEX_USE_OPENMP #pragma omp parallel #endif { - auto comm = context.get_communicator(); - const auto rank = comm.rank(); - const auto size = comm.size(); - const auto thread_id = THREADID; - const auto peer_rank = (rank+1)%2; + auto comm = context.get_communicator(); + const auto rank = comm.rank(); + const auto size = comm.size(); + const auto thread_id = THREADID; + const auto peer_rank = (rank + 1) % 2; bool using_mt = false; #ifdef GHEX_USE_OPENMP using_mt = true; #endif - if (thread_id==0 && rank==0) - { - std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; - }; + if (thread_id == 0 && rank == 0) + { + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " + << typeid(comm).name() << "\n\n"; + }; - std::vector smsgs(inflight); - std::vector rmsgs(inflight); + std::vector smsgs(inflight); + std::vector rmsgs(inflight); std::vector sreqs(inflight); std::vector rreqs(inflight); - for(int j=0; j=(niter/10)) { - std::cout << sent << " sent\n"; - sdbg = 0; - } - - if(rank==0 && thread_id==0 && rdbg>=(niter/10)) { - std::cout << received << " received\n"; - rdbg = 0; - } - - if(thread_id == 0 && dbg >= (niter/10)) { - dbg = 0; - std::cout << rank << " total bwdt MB/s: " - << ((double)(received-last_received + sent-last_sent)*size*buff_size/2)/timer.toc() - << "\n"; - timer.tic(); - last_received = received; - last_sent = sent; - } - - if(rreqs[j].test()) { - received++; - lrecv++; - rdbg+=num_threads; - dbg+=num_threads; - rreqs[j] = comm.recv(rmsgs[j], peer_rank, thread_id*inflight + j); - } - - 
if(lsent < lrecv+2*inflight && sent < niter && sreqs[j].test()) { - sent++; - lsent++; - sdbg+=num_threads; - dbg+=num_threads; - sreqs[j] = comm.send(smsgs[j], peer_rank, thread_id*inflight + j); - } - } - } + int lsent = 0, lrecv = 0; + while (sent < niter || received < niter) + { + for (int j = 0; j < inflight; j++) + { + if (rank == 0 && thread_id == 0 && sdbg >= (niter / 10)) + { + std::cout << sent << " sent\n"; + sdbg = 0; + } + + if (rank == 0 && thread_id == 0 && rdbg >= (niter / 10)) + { + std::cout << received << " received\n"; + rdbg = 0; + } + + if (thread_id == 0 && dbg >= (niter / 10)) + { + dbg = 0; + std::cout << rank << " total bwdt MB/s: " + << ((double)(received - last_received + sent - last_sent) * size * + buff_size / 2) / + timer.toc() + << "\n"; + timer.tic(); + last_received = received; + last_sent = sent; + } + + if (rreqs[j].test()) + { + received++; + lrecv++; + rdbg += num_threads; + dbg += num_threads; + rreqs[j] = comm.recv(rmsgs[j], peer_rank, thread_id * inflight + j); + } + + if (lsent < lrecv + 2 * inflight && sent < niter && sreqs[j].test()) + { + sent++; + lsent++; + sdbg += num_threads; + dbg += num_threads; + sreqs[j] = comm.send(smsgs[j], peer_rank, thread_id * inflight + j); + } + } + } barrier(comm); - if(thread_id == 0 && rank == 0){ + if (thread_id == 0 && rank == 0) + { const auto t = ttimer.toc(); - std::cout << "time: " << t/1000000 << "s\n"; - std::cout << "final MB/s: " << ((double)niter*size*buff_size)/t << "\n"; + std::cout << "time: " << t / 1000000 << "s\n"; + std::cout << "final MB/s: " << ((double)niter * size * buff_size) / t << "\n"; } // tail loops - submit RECV requests until @@ -212,29 +224,34 @@ int main(int argc, char *argv[]) do { comm.progress(); // check if we have completed all our posted sends - if(!send_complete){ + if (!send_complete) + { incomplete_sends = 0; - for(int j=0; j -using transport = ghex::tl::ucx_tag; +using transport = ghex::tl::ucx_tag; #else // MPI backend #include -using transport 
= ghex::tl::mpi_tag; +using transport = ghex::tl::mpi_tag; #endif #include @@ -44,18 +43,19 @@ using MsgType = gridtools::ghex::tl::message_buffer<>; #define THREADID 0 #endif -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int niter, buff_size; - int inflight; - int mode; + int niter, buff_size; + int inflight; + int mode; gridtools::ghex::timer timer, ttimer; - if(argc != 4) - { - std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; - std::terminate(); - } + if (argc != 4) + { + std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; + std::terminate(); + } niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); @@ -74,70 +74,76 @@ int main(int argc, char *argv[]) #ifdef GHEX_USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ + if (mode != MPI_THREAD_MULTIPLE) + { std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; std::terminate(); } #else MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); #endif - + { - auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; #ifdef GHEX_USE_OPENMP #pragma omp parallel #endif { - auto comm = context.get_communicator(); - const auto rank = comm.rank(); - const auto size = comm.size(); - const auto thread_id = THREADID; - const auto peer_rank = (rank+1)%2; + auto comm = context.get_communicator(); + const auto rank = comm.rank(); + const auto size = comm.size(); + const auto thread_id = THREADID; + const auto peer_rank = (rank + 1) % 2; bool using_mt = false; #ifdef GHEX_USE_OPENMP using_mt = true; #endif - if (thread_id==0 && rank==0) - { - std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; - }; + if (thread_id == 0 && rank == 0) + { + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " + << 
typeid(comm).name() << "\n\n"; + }; - std::vector smsgs(inflight); - std::vector rmsgs(inflight); + std::vector smsgs(inflight); + std::vector rmsgs(inflight); std::vector sreqs(inflight); std::vector rreqs(inflight); - for(int j=0; j= (niter/10)) { + while (sent < niter || received < niter) + { + if (thread_id == 0 && dbg >= (niter / 10)) + { dbg = 0; std::cout << rank << " total bwdt MB/s: " - << ((double)(received-last_received + sent-last_sent)*size*buff_size/2)/timer.toc() + << ((double)(received - last_received + sent - last_sent) * size * + buff_size / 2) / + timer.toc() << "\n"; timer.tic(); last_received = received; @@ -145,17 +151,19 @@ int main(int argc, char *argv[]) } /* submit comm */ - for(int j=0; j using MsgType = gridtools::ghex::tl::shared_message_buffer; /* available comm slots */ -int *available = NULL; -int ongoing_comm = 0; +int* available = NULL; +int ongoing_comm = 0; -void send_callback(MsgType mesg, int rank, int tag) +void +send_callback(MsgType mesg, int rank, int tag) { // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; available[tag] = 1; ongoing_comm--; } -void recv_callback(MsgType mesg, int rank, int tag) +void +recv_callback(MsgType mesg, int rank, int tag) { // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; available[tag] = 1; ongoing_comm--; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { int rank, size, threads, peer_rank; int niter, buff_size; @@ -68,9 +71,10 @@ int main(int argc, char *argv[]) int mode; #ifdef USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ - std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; - std::terminate(); + if (mode != MPI_THREAD_MULTIPLE) + { + std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; + std::terminate(); } #else MPI_Init_thread(NULL, NULL, 
MPI_THREAD_SINGLE, &mode); @@ -78,87 +82,93 @@ int main(int argc, char *argv[]) #endif gridtools::ghex::tl::callback_communicator comm; - AllocType alloc; + AllocType alloc; niter = atoi(argv[1]); buff_size = atoi(argv[2]); - inflight = atoi(argv[3]); - + inflight = atoi(argv[3]); + rank = comm.rank(); size = comm.size(); - peer_rank = (rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; + if (rank == 0) + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() + << "\n\n"; { - gridtools::ghex::timer timer; - long bytes = 0; - available = new int[inflight]; - - for(int j=0; j= dbg) { - std::cout << sent << " iters\n"; - dbg = dbg + blk; - } - - available[j] = 0; - sent++; - ongoing_comm++; - MsgType msg = MsgType(buff_size, alloc); - comm.send(msg, peer_rank, j, send_callback); - } - else comm.progress(); - } - } - - } else { - - /* recv requests are resubmitted as soon as a request is completed */ - /* so the number of submitted recv requests is always constant (inflight) */ - /* expect niter messages (i.e., niter recv callbacks) on receiver */ - ongoing_comm = niter; - while(ongoing_comm > 0){ - - for(int j=0; j 0){ - comm.progress(); - } - - comm.flush(); - comm.barrier(); - - if(rank == 1) timer.vtoc(bytes); + gridtools::ghex::timer timer; + long bytes = 0; + available = new int[inflight]; + + for (int j = 0; j < inflight; j++) { available[j] = 1; } + + if (rank == 1) + { + timer.tic(); + bytes = (double)niter * size * buff_size / 2; + } + + if (rank == 0) + { + int i = 0, dbg = 0, blk; + blk = niter / 10; + dbg = dbg + blk; + + /* send niter messages - as soon as a slot becomes free */ + int sent = 0; + while (sent < niter) + { + for (int j = 0; j < inflight; j++) + { + if (available[j]) + { + if (rank == 0 && sent >= dbg) + { + std::cout << sent << " iters\n"; + dbg = dbg + blk; + } + + available[j] = 0; + sent++; + 
ongoing_comm++; + MsgType msg = MsgType(buff_size, alloc); + comm.send(msg, peer_rank, j, send_callback); + } + else + comm.progress(); + } + } + } + else + { + /* recv requests are resubmitted as soon as a request is completed */ + /* so the number of submitted recv requests is always constant (inflight) */ + /* expect niter messages (i.e., niter recv callbacks) on receiver */ + ongoing_comm = niter; + while (ongoing_comm > 0) + { + for (int j = 0; j < inflight; j++) + { + if (available[j]) + { + available[j] = 0; + MsgType msg = MsgType(buff_size, alloc); + comm.recv(msg, peer_rank, j, recv_callback); + } + else + comm.progress(); + } + } + } + + /* complete all comm */ + while (ongoing_comm > 0) { comm.progress(); } + + comm.flush(); + comm.barrier(); + + if (rank == 1) timer.vtoc(bytes); } #ifdef USE_MPI diff --git a/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit.cpp b/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit.cpp index 91aaf565..90f809e5 100644 --- a/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit.cpp +++ b/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit.cpp @@ -40,25 +40,28 @@ using CommType = gridtools::ghex::tl::communicator using MsgType = gridtools::ghex::tl::shared_message_buffer; /* available comm slots */ -int *available = NULL; -int ongoing_comm = 0; +int* available = NULL; +int ongoing_comm = 0; -void send_callback(MsgType mesg, int rank, int tag) +void +send_callback(MsgType mesg, int rank, int tag) { // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; available[tag] = 1; ongoing_comm--; } -gridtools::ghex::tl::callback_communicator *pcomm = NULL; -void recv_callback(MsgType mesg, int rank, int tag) +gridtools::ghex::tl::callback_communicator* pcomm = NULL; +void +recv_callback(MsgType mesg, int rank, int tag) { // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; pcomm->recv(mesg, rank, tag, recv_callback); 
ongoing_comm--; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { int rank, size, threads, peer_rank; int niter, buff_size; @@ -68,9 +71,10 @@ int main(int argc, char *argv[]) int mode; #ifdef USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ - std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; - std::terminate(); + if (mode != MPI_THREAD_MULTIPLE) + { + std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; + std::terminate(); } #else MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); @@ -78,88 +82,92 @@ int main(int argc, char *argv[]) #endif gridtools::ghex::tl::callback_communicator comm; - AllocType alloc; + AllocType alloc; /* needed in the recv_callback to resubmit the recv request */ pcomm = &comm; niter = atoi(argv[1]); buff_size = atoi(argv[2]); - inflight = atoi(argv[3]); - + inflight = atoi(argv[3]); + rank = comm.rank(); size = comm.size(); - peer_rank = (rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; + if (rank == 0) + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() + << "\n\n"; { - gridtools::ghex::timer timer; - long bytes = 0; - - available = new int[inflight]; - for(int j=0; j= dbg) { - std::cout << sent << " iters\n"; - dbg = dbg + blk; - } - - available[j] = 0; - sent++; - ongoing_comm++; - MsgType msg = MsgType(buff_size, alloc); - comm.send(msg, peer_rank, j, send_callback); - } - else comm.progress(); - } - } - - } else { - - /* recv requests are resubmitted as soon as a request is completed */ - /* so the number of submitted recv requests is always constant (inflight) */ - /* expect niter messages (i.e., niter recv callbacks) on receiver */ - ongoing_comm = niter; - - /* submit all recv requests */ - for(int j=0; j 0){ - comm.progress(); - } - - comm.flush(); - comm.barrier(); 
- - if(rank == 1) timer.vtoc(bytes); + gridtools::ghex::timer timer; + long bytes = 0; + + available = new int[inflight]; + for (int j = 0; j < inflight; j++) { available[j] = 1; } + + if (rank == 1) + { + timer.tic(); + bytes = (double)niter * size * buff_size / 2; + } + + if (rank == 0) + { + int i = 0, dbg = 0, blk; + blk = niter / 10; + dbg = dbg + blk; + + /* send niter messages - as soon as a slot becomes free */ + int sent = 0; + while (sent < niter) + { + for (int j = 0; j < inflight; j++) + { + if (available[j]) + { + if (rank == 0 && sent >= dbg) + { + std::cout << sent << " iters\n"; + dbg = dbg + blk; + } + + available[j] = 0; + sent++; + ongoing_comm++; + MsgType msg = MsgType(buff_size, alloc); + comm.send(msg, peer_rank, j, send_callback); + } + else + comm.progress(); + } + } + } + else + { + /* recv requests are resubmitted as soon as a request is completed */ + /* so the number of submitted recv requests is always constant (inflight) */ + /* expect niter messages (i.e., niter recv callbacks) on receiver */ + ongoing_comm = niter; + + /* submit all recv requests */ + for (int j = 0; j < inflight; j++) + { + MsgType msg = MsgType(buff_size, alloc); + comm.recv(msg, peer_rank, j, recv_callback); + } + + /* requests are re-submitted inside the calback. */ + /* progress (below) until niter messages have been received. */ + } + + /* complete all comm */ + while (ongoing_comm > 0) { comm.progress(); } + + comm.flush(); + comm.barrier(); + + if (rank == 1) timer.vtoc(bytes); } #ifdef USE_MPI diff --git a/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit_mt.cpp b/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit_mt.cpp index 9c8ec76c..3000db55 100644 --- a/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit_mt.cpp +++ b/benchmarks/transport/ghex_p2p_cb_dynamic_resubmit_mt.cpp @@ -44,36 +44,38 @@ using MsgType = gridtools::ghex::tl::shared_message_buffer; there is no way of knowing which thread will service which requests, and how many. 
*/ -int comm_cnt = 0, nlcomm_cnt = 0, submit_cnt = 0; -int thrid, nthr; +int comm_cnt = 0, nlcomm_cnt = 0, submit_cnt = 0; +int thrid, nthr; #pragma omp threadprivate(comm_cnt, nlcomm_cnt, submit_cnt, thrid, nthr) /* available comm slots - per-thread */ -int **available = NULL; -int ongoing_comm = 0; -int inflight; +int** available = NULL; +int ongoing_comm = 0; +int inflight; -void send_callback(MsgType mesg, int rank, int tag) +void +send_callback(MsgType mesg, int rank, int tag) { // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; - int pthr = tag/inflight; - int pos = tag - pthr*inflight; - if(pthr != thrid) nlcomm_cnt++; + int pthr = tag / inflight; + int pos = tag - pthr * inflight; + if (pthr != thrid) nlcomm_cnt++; comm_cnt++; available[pthr][pos] = 1; } -gridtools::ghex::tl::callback_communicator *pcomm = NULL; -#pragma omp threadprivate(pcomm) +gridtools::ghex::tl::callback_communicator* pcomm = NULL; +#pragma omp threadprivate(pcomm) -void recv_callback(MsgType mesg, int rank, int tag) +void +recv_callback(MsgType mesg, int rank, int tag) { // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << " ongoing " << ongoing_comm << "\n"; - int pthr = tag/inflight; - int pos = tag - pthr*inflight; - if(pthr != thrid) nlcomm_cnt++; + int pthr = tag / inflight; + int pos = tag - pthr * inflight; + if (pthr != thrid) nlcomm_cnt++; comm_cnt++; - submit_cnt+=nthr; + submit_cnt += nthr; /* resubmit the recv request */ pcomm->recv(mesg, rank, tag, recv_callback); @@ -82,20 +84,22 @@ void recv_callback(MsgType mesg, int rank, int tag) ongoing_comm--; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int rank, size, threads, peer_rank; - int niter, buff_size; + int rank, size, threads, peer_rank; + int niter, buff_size; gridtools::ghex::timer timer; - long bytes = 0; + long bytes = 0; #ifdef USE_MPI int mode; #ifdef USE_OPENMP 
MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ - std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; - std::terminate(); + if (mode != MPI_THREAD_MULTIPLE) + { + std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; + std::terminate(); } #else MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); @@ -104,105 +108,110 @@ int main(int argc, char *argv[]) niter = atoi(argv[1]); buff_size = atoi(argv[2]); - inflight = atoi(argv[3]); - + inflight = atoi(argv[3]); + #pragma omp parallel { - gridtools::ghex::tl::callback_communicator *comm - = new gridtools::ghex::tl::callback_communicator(); - AllocType alloc; + gridtools::ghex::tl::callback_communicator* comm = + new gridtools::ghex::tl::callback_communicator(); + AllocType alloc; #pragma omp master - { - rank = comm->rank(); - size = comm->size(); - peer_rank = (rank+1)%2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(*comm).name() << "\n\n"; - } + { + rank = comm->rank(); + size = comm->size(); + peer_rank = (rank + 1) % 2; + if (rank == 0) + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " + << typeid(*comm).name() << "\n\n"; + } - /* needed in the recv_callback to resubmit the recv request */ - pcomm = comm; + /* needed in the recv_callback to resubmit the recv request */ + pcomm = comm; - thrid = omp_get_thread_num(); - nthr = omp_get_num_threads(); + thrid = omp_get_thread_num(); + nthr = omp_get_num_threads(); #pragma omp master - available = new int*[nthr]; + available = new int*[nthr]; #pragma omp barrier - available[thrid] = new int[inflight]; - - for(int j=0; jbarrier(); - - if(rank == 1) { - timer.tic(); - bytes = (double)niter*size*buff_size/2; - } - - if(rank == 0){ - - int i = 0, dbg = 0, blk; - blk = niter / 10; - dbg = dbg + blk; - - /* send niter messages - as soon as a slot becomes free */ - while(submit_cnt < niter){ - - for(int j=0; j= dbg) { - std::cout 
<< submit_cnt << " iters\n"; - dbg = dbg + blk; - } - available[thrid][j] = 0; - submit_cnt += nthr; - MsgType msg = MsgType(buff_size, alloc); - comm->send(msg, peer_rank, thrid*inflight+j, send_callback); - } - else comm->progress(); - } - } - - } else { - - /* recv requests are resubmitted as soon as a request is completed */ - /* so the number of submitted recv requests is always constant (inflight) */ - /* expect niter messages (i.e., niter recv callbacks) on receiver */ - ongoing_comm = niter; + available[thrid] = new int[inflight]; + + for (int j = 0; j < inflight; j++) { available[thrid][j] = 1; } + + /* make sure both ranks are started and all threads initialized */ + comm->barrier(); + + if (rank == 1) + { + timer.tic(); + bytes = (double)niter * size * buff_size / 2; + } + + if (rank == 0) + { + int i = 0, dbg = 0, blk; + blk = niter / 10; + dbg = dbg + blk; + + /* send niter messages - as soon as a slot becomes free */ + while (submit_cnt < niter) + { + for (int j = 0; j < inflight; j++) + { + if (available[thrid][j]) + { + if (rank == 0 && thrid == 0 && submit_cnt >= dbg) + { + std::cout << submit_cnt << " iters\n"; + dbg = dbg + blk; + } + available[thrid][j] = 0; + submit_cnt += nthr; + MsgType msg = MsgType(buff_size, alloc); + comm->send(msg, peer_rank, thrid * inflight + j, send_callback); + } + else + comm->progress(); + } + } + } + else + { + /* recv requests are resubmitted as soon as a request is completed */ + /* so the number of submitted recv requests is always constant (inflight) */ + /* expect niter messages (i.e., niter recv callbacks) on receiver */ + ongoing_comm = niter; #pragma omp barrier - /* submit all recv requests */ - for(int j=0; jrecv(msg, peer_rank, thrid*inflight+j, recv_callback); - submit_cnt+=nthr; - } - - /* requests are re-submitted inside the calback. */ - /* progress (below) until niter messages have been received. 
*/ - - /* complete all comm */ - while(ongoing_comm > 0){ - comm->progress(); - } - } + /* submit all recv requests */ + for (int j = 0; j < inflight; j++) + { + MsgType msg = MsgType(buff_size, alloc); + comm->recv(msg, peer_rank, thrid * inflight + j, recv_callback); + submit_cnt += nthr; + } + + /* requests are re-submitted inside the calback. */ + /* progress (below) until niter messages have been received. */ + + /* complete all comm */ + while (ongoing_comm > 0) { comm->progress(); } + } #pragma omp barrier - comm->flush(); - comm->barrier(); - + comm->flush(); + comm->barrier(); + #pragma omp critical - std::cout << "rank " << rank << " thread " << thrid << " submitted " << submit_cnt/nthr - << " serviced " << comm_cnt << ", non-local " << nlcomm_cnt << " completion events\n"; - - delete comm; + std::cout << "rank " << rank << " thread " << thrid << " submitted " << submit_cnt / nthr + << " serviced " << comm_cnt << ", non-local " << nlcomm_cnt + << " completion events\n"; + + delete comm; } - if(rank == 1) timer.vtoc(bytes); + if (rank == 1) timer.vtoc(bytes); #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); diff --git a/benchmarks/transport/ghex_p2p_cb_resubmit.cpp b/benchmarks/transport/ghex_p2p_cb_resubmit.cpp index 9e4850ae..cabbe7b1 100644 --- a/benchmarks/transport/ghex_p2p_cb_resubmit.cpp +++ b/benchmarks/transport/ghex_p2p_cb_resubmit.cpp @@ -12,7 +12,6 @@ #include - #ifdef USE_MPI /* MPI backend */ @@ -34,37 +33,41 @@ using CommType = gridtools::ghex::tl::communicator using MsgType = gridtools::ghex::tl::shared_message_buffer<>; /* available comm slots */ -int *available = NULL; -int ongoing_comm = 0; +int* available = NULL; +int ongoing_comm = 0; -void send_callback(MsgType mesg, int rank, int tag) +void +send_callback(MsgType mesg, int rank, int tag) { // std::cout << "send callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; available[tag] = 1; ongoing_comm--; } -gridtools::ghex::tl::callback_communicator *pcomm 
= NULL; -void recv_callback(MsgType mesg, int rank, int tag) +gridtools::ghex::tl::callback_communicator* pcomm = NULL; +void +recv_callback(MsgType mesg, int rank, int tag) { // std::cout << "recv callback called " << rank << " thread " << omp_get_thread_num() << " tag " << tag << "\n"; pcomm->recv(mesg, rank, tag, recv_callback); ongoing_comm--; } -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { int rank, size, threads, peer_rank; int niter, buff_size; int inflight; - + #ifdef USE_MPI int mode; #ifdef USE_OPENMP MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ - std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; - std::terminate(); + if (mode != MPI_THREAD_MULTIPLE) + { + std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; + std::terminate(); } #else MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); @@ -78,81 +81,85 @@ int main(int argc, char *argv[]) niter = atoi(argv[1]); buff_size = atoi(argv[2]); - inflight = atoi(argv[3]); - + inflight = atoi(argv[3]); + rank = comm.rank(); size = comm.size(); - peer_rank = (rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() << "\n\n"; + if (rank == 0) + std::cout << "\n\nrunning test " << __FILE__ << " with communicator " << typeid(comm).name() + << "\n\n"; { - gridtools::ghex::timer timer; - long bytes = 0; - std::vector msgs; - available = new int[inflight]; - - for(int j=0; j= dbg) { - std::cout << sent << " iters\n"; - dbg = dbg + blk; - } - - available[j] = 0; - sent++; - ongoing_comm++; - comm.send(msgs[j], peer_rank, j, send_callback); - } - else comm.progress(); - } - } - - } else { - - /* recv requests are resubmitted as soon as a request is completed */ - /* so the number of submitted recv requests is always constant (inflight) */ - /* expect niter messages (i.e., niter recv callbacks) on receiver */ - ongoing_comm 
= niter; - - /* submit all recv requests */ - for(int j=0; j 0){ - comm.progress(); - } - - if(rank == 1) timer.vtoc(bytes); - - comm.flush(); - comm.barrier(); + gridtools::ghex::timer timer; + long bytes = 0; + std::vector msgs; + available = new int[inflight]; + + for (int j = 0; j < inflight; j++) + { + available[j] = 1; + msgs.emplace_back(buff_size); + } + + if (rank == 1) + { + timer.tic(); + bytes = (double)niter * size * buff_size / 2; + } + + if (rank == 0) + { + int i = 0, dbg = 0, blk; + blk = niter / 10; + dbg = dbg + blk; + + /* send niter messages - as soon as a slot becomes free */ + int sent = 0; + while (sent < niter) + { + for (int j = 0; j < inflight; j++) + { + if (available[j]) + { + if (rank == 0 && sent >= dbg) + { + std::cout << sent << " iters\n"; + dbg = dbg + blk; + } + + available[j] = 0; + sent++; + ongoing_comm++; + comm.send(msgs[j], peer_rank, j, send_callback); + } + else + comm.progress(); + } + } + } + else + { + /* recv requests are resubmitted as soon as a request is completed */ + /* so the number of submitted recv requests is always constant (inflight) */ + /* expect niter messages (i.e., niter recv callbacks) on receiver */ + ongoing_comm = niter; + + /* submit all recv requests */ + for (int j = 0; j < inflight; j++) { comm.recv(msgs[j], peer_rank, j, recv_callback); } + + /* requests are re-submitted inside the calback. */ + /* progress (below) until niter messages have been received. 
*/ + } + + /* complete all comm */ + while (ongoing_comm > 0) { comm.progress(); } + + if (rank == 1) timer.vtoc(bytes); + + comm.flush(); + comm.barrier(); } #ifdef USE_MPI diff --git a/benchmarks/transport/mpi_p2p_avail_any.cpp b/benchmarks/transport/mpi_p2p_avail_any.cpp index 6e3bf71f..c965c8d5 100644 --- a/benchmarks/transport/mpi_p2p_avail_any.cpp +++ b/benchmarks/transport/mpi_p2p_avail_any.cpp @@ -12,98 +12,107 @@ #include -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int rank, size, mode, peer_rank; - int niter, buff_size; - int inflight; + int rank, size, mode, peer_rank; + int niter, buff_size; + int inflight; MPI_Comm mpi_comm; gridtools::ghex::timer timer; - long bytes = 0; + long bytes = 0; niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); - + #ifdef USE_OPENMP - MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); + MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); #else - // MPI_Init(NULL, NULL); - MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); + // MPI_Init(NULL, NULL); + MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); #endif MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm); MPI_Comm_rank(mpi_comm, &rank); MPI_Comm_size(mpi_comm, &size); - peer_rank = (rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; + if (rank == 0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; { - unsigned char **buffers = new unsigned char *[inflight]; - MPI_Request *req = new MPI_Request[inflight]; - - for(int j=0; j=(niter/10)) { - std::cout << i << " iters\n"; - dbg=0; - } - MPI_Isend(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid*inflight+j, mpi_comm, &req[j]); - } else - MPI_Irecv(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid*inflight+j, mpi_comm, &req[j]); - - dbg +=nthr; i+=nthr; - } - - // MPI_Waitany(inflight, req, &completed, MPI_STATUS_IGNORE); - // // MPI_Testany(inflight, req, &completed, &flag, MPI_STATUS_IGNORE); - // 
// if(!flag) continue; - - // if(rank==0 && i%(niter/10)==0) { - // std::cout << i << " iters\n"; - // } - - // if(rank==0) - // MPI_Isend(buffers[completed], buff_size, MPI_BYTE, peer_rank, completed, mpi_comm, &req[completed]); - // else - // MPI_Irecv(buffers[completed], buff_size, MPI_BYTE, peer_rank, completed, mpi_comm, &req[completed]); - // i++; if(i==niter) break; - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 1) timer.vtoc(bytes); + unsigned char** buffers = new unsigned char*[inflight]; + MPI_Request* req = new MPI_Request[inflight]; + + for (int j = 0; j < inflight; j++) + { + MPI_Alloc_mem(buff_size, MPI_INFO_NULL, &buffers[j]); + req[j] = MPI_REQUEST_NULL; + for (int i = 0; i < buff_size; i++) { buffers[j][i] = i % (rank + 1); } + } + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 1) + { + timer.tic(); + bytes = (double)niter * size * buff_size / 2; + } + + /* submit inflight async requests */ + for (int j = 0; j < inflight; j++) + { + if (rank == 0) + MPI_Isend(buffers[j], buff_size, MPI_BYTE, peer_rank, j, mpi_comm, &req[j]); + else + MPI_Irecv(buffers[j], buff_size, MPI_BYTE, peer_rank, j, mpi_comm, &req[j]); + } + + int i = 0, j, dbg = 0, thrid = 0, nthr = 1; + while (i < niter) + { + int completed, flag; + + MPI_Testany(inflight, req, &j, &flag, MPI_STATUS_IGNORE); + if (flag) + { + if (rank == 0) + { + if (thrid == 0 && dbg >= (niter / 10)) + { + std::cout << i << " iters\n"; + dbg = 0; + } + MPI_Isend(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &req[j]); + } + else + MPI_Irecv(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &req[j]); + + dbg += nthr; + i += nthr; + } + + // MPI_Waitany(inflight, req, &completed, MPI_STATUS_IGNORE); + // // MPI_Testany(inflight, req, &completed, &flag, MPI_STATUS_IGNORE); + // // if(!flag) continue; + + // if(rank==0 && i%(niter/10)==0) { + // std::cout << i << " iters\n"; + // } + + // if(rank==0) + // MPI_Isend(buffers[completed], buff_size, 
MPI_BYTE, peer_rank, completed, mpi_comm, &req[completed]); + // else + // MPI_Irecv(buffers[completed], buff_size, MPI_BYTE, peer_rank, completed, mpi_comm, &req[completed]); + // i++; if(i==niter) break; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 1) timer.vtoc(bytes); } - + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); } diff --git a/benchmarks/transport/mpi_p2p_avail_mt.cpp b/benchmarks/transport/mpi_p2p_avail_mt.cpp index dfecde81..5d7c97c5 100644 --- a/benchmarks/transport/mpi_p2p_avail_mt.cpp +++ b/benchmarks/transport/mpi_p2p_avail_mt.cpp @@ -14,70 +14,76 @@ #include -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int rank, size, threads, peer_rank; - int niter, buff_size; - int inflight; + int rank, size, threads, peer_rank; + int niter, buff_size; + int inflight; MPI_Comm mpi_comm; - int ncomm = 0; + int ncomm = 0; gridtools::ghex::timer timer; - long bytes = 0; + long bytes = 0; niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); - + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &threads); MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm); MPI_Comm_rank(mpi_comm, &rank); MPI_Comm_size(mpi_comm, &size); - peer_rank = (rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; + if (rank == 0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; -#pragma omp parallel shared(niter, buff_size, peer_rank) reduction( + : ncomm ) +#pragma omp parallel shared(niter, buff_size, peer_rank) reduction(+ : ncomm) { - int thrid, nthr; - unsigned char **buffers = new unsigned char *[inflight]; - MPI_Request *req = new MPI_Request[inflight]; - - thrid = omp_get_thread_num(); - nthr = omp_get_num_threads(); - - for(int j=0; j=(niter/10)) { - std::cout << i << " iters\n"; - dbg=0; - } - MPI_Isend(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid*inflight+j, mpi_comm, &req[j]); - } else - MPI_Irecv(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid*inflight+j, mpi_comm, 
&req[j]); + */ - ncomm++; - dbg +=nthr; i+=nthr; - } - } - std::cout << "rank " << rank << " thrid " << thrid << " ncomm " << ncomm << "\n"; + /* A version with MPI_Testany instead of an explicit loop : both are the same */ + MPI_Testany(inflight, req, &j, &flag, MPI_STATUS_IGNORE); + if (flag) + { + if (rank == 0) + { + if (thrid == 0 && dbg >= (niter / 10)) + { + std::cout << i << " iters\n"; + dbg = 0; + } + MPI_Isend(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &req[j]); + } + else + MPI_Irecv(buffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &req[j]); + + ncomm++; + dbg += nthr; + i += nthr; + } + } + std::cout << "rank " << rank << " thrid " << thrid << " ncomm " << ncomm << "\n"; #pragma omp barrier #pragma omp master - { - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 1) timer.vtoc(bytes); - } + { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 1) timer.vtoc(bytes); + } #pragma omp barrier } diff --git a/benchmarks/transport/mpi_p2p_bi_avail.cpp b/benchmarks/transport/mpi_p2p_bi_avail.cpp index e80fe43d..e48949e2 100644 --- a/benchmarks/transport/mpi_p2p_bi_avail.cpp +++ b/benchmarks/transport/mpi_p2p_bi_avail.cpp @@ -12,70 +12,74 @@ #include #include -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { - int rank, size, mode, peer_rank; - int niter, buff_size; - int inflight; + int rank, size, mode, peer_rank; + int niter, buff_size; + int inflight; MPI_Comm mpi_comm; gridtools::ghex::timer timer; - long bytes = 0; + long bytes = 0; niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); - + #ifdef USE_OPENMP - MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); + MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &mode); #else - MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); + MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &mode); #endif MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm); MPI_Comm_rank(mpi_comm, &rank); MPI_Comm_size(mpi_comm, &size); - peer_rank = 
(rank+1)%2; + peer_rank = (rank + 1) % 2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; + if (rank == 0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; { - unsigned char **sbuffers = new unsigned char *[inflight]; - unsigned char **rbuffers = new unsigned char *[inflight]; - MPI_Request *sreq = new MPI_Request[inflight]; - MPI_Request *rreq = new MPI_Request[inflight]; - - for(int j=0; j sent(0); std::atomic received(0); -int last_received = 0; -int last_sent = 0; +int last_received = 0; +int last_sent = 0; -int main(int argc, char *argv[]) +int +main(int argc, char* argv[]) { int rank, size, peer_rank; int niter, buff_size; @@ -57,18 +58,20 @@ int main(int argc, char *argv[]) gridtools::ghex::timer timer, ttimer; - if(argc != 4){ + if (argc != 4) + { std::cerr << "Usage: bench [niter] [msg_size] [inflight]" << "\n"; std::terminate(); } niter = atoi(argv[1]); buff_size = atoi(argv[2]); inflight = atoi(argv[3]); - + int mode; #ifdef GHEX_USE_OPENMP MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mode); - if(mode != MPI_THREAD_MULTIPLE){ + if (mode != MPI_THREAD_MULTIPLE) + { std::cerr << "MPI_THREAD_MULTIPLE not supported by MPI, aborting\n"; std::terminate(); } @@ -76,34 +79,35 @@ int main(int argc, char *argv[]) MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &mode); #endif - THREAD_PARALLEL_BEG() { - - int thrid, nthr; - MPI_Comm mpi_comm; - unsigned char **sbuffers = new unsigned char *[inflight]; - unsigned char **rbuffers = new unsigned char *[inflight]; - MPI_Request *sreq = new MPI_Request[inflight]; - MPI_Request *rreq = new MPI_Request[inflight]; - - THREAD_MASTER() { + THREAD_PARALLEL_BEG() + { + int thrid, nthr; + MPI_Comm mpi_comm; + unsigned char** sbuffers = new unsigned char*[inflight]; + unsigned char** rbuffers = new unsigned char*[inflight]; + MPI_Request* sreq = new MPI_Request[inflight]; + MPI_Request* rreq = new MPI_Request[inflight]; + + THREAD_MASTER() + { MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
MPI_Comm_size(MPI_COMM_WORLD, &size); - peer_rank = (rank+1)%2; - if(rank==0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; + peer_rank = (rank + 1) % 2; + if (rank == 0) std::cout << "\n\nrunning test " << __FILE__ << "\n\n"; } thrid = GET_THREAD_NUM(); nthr = GET_NUM_THREADS(); /* duplicate the communicator - all threads in order */ - for(int tid=0; tid=(niter/10)) { + while (sent < niter || received < niter) + { + if (thrid == 0 && sdbg >= (niter / 10)) + { std::cout << rank << " " << sent << " sent\n"; sdbg = 0; } - if(thrid==0 && rdbg>=(niter/10)) { + if (thrid == 0 && rdbg >= (niter / 10)) + { std::cout << rank << " " << received << " received\n"; rdbg = 0; } - if(thrid == 0 && dbg >= (2*niter/10)) { + if (thrid == 0 && dbg >= (2 * niter / 10)) + { dbg = 0; - timer.vtoc(header, (double)(received-last_received + sent-last_sent)*size*buff_size/2); + timer.vtoc(header, + (double)(received - last_received + sent - last_sent) * size * buff_size / 2); timer.tic(); last_received = received; last_sent = sent; } - // testany version is much faster with OpenMPI, esp. for large messages + // testany version is much faster with OpenMPI, esp. 
for large messages // #define USE_TESTANY #ifdef USE_TESTANY MPI_Testany(inflight, rreq, &j, &flag, MPI_STATUS_IGNORE); - if(flag) { - MPI_Irecv(rbuffers[j], buff_size, MPI_BYTE, peer_rank, thrid*inflight+j, mpi_comm, &rreq[j]); + if (flag) + { + MPI_Irecv(rbuffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &rreq[j]); dbg += nthr; rdbg += nthr; received++; lrecv++; } - if(lsent < lrecv+2*inflight && sent= (niter/10)) { - dbg = 0; - timer.vtoc(header, (double)(i-last_i)*size*buff_size); - timer.tic(); - last_i = i; - } - - /* submit comm */ - for(int j=0; j= (niter / 10)) + { + dbg = 0; + timer.vtoc(header, (double)(i - last_i) * size * buff_size); + timer.tic(); + last_i = i; + } + + /* submit comm */ + for (int j = 0; j < inflight; j++) + { + MPI_Irecv(rbuffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &rreq[j]); + MPI_Isend(sbuffers[j], buff_size, MPI_BYTE, peer_rank, thrid * inflight + j, + mpi_comm, &sreq[j]); + dbg += nthr; + i += nthr; + } + + /* wait for all to complete */ #ifdef USE_WAITALL MPI_Waitall(inflight, sreq, MPI_STATUS_IGNORE); MPI_Waitall(inflight, rreq, MPI_STATUS_IGNORE); #else - for(int j=0; j -namespace ghex { - - namespace allocator { - - template - struct buffer_ptr { - T *m_buffer; - std::size_t m_size; - - buffer_ptr() = delete; - buffer_ptr(T *p, std::size_t size): - m_buffer{p}, m_size{size} - {} - }; - - template - static std::vector>> buffers; - - int thrid; - DECLARE_THREAD_PRIVATE(thrid) - - template - struct pool_allocator { - - typedef T value_type; - - BaseAllocator m_ba; - - pool_allocator(){ - thrid = GET_THREAD_NUM(); - THREAD_MASTER (){ - thread_rank_type nthr = GET_NUM_THREADS(); - if(buffers.size() != nthr){ - buffers.resize(nthr); - } - } - THREAD_BARRIER(); - } - - pool_allocator(const pool_allocator &other) : - m_ba{other.m_ba} - {} - - void initialize(int nb, int size) - { - for(int i=0; i container(m_ba.allocate(size), size); - memset(container.m_buffer, 0, size); 
- buffers[thrid].push_back(container); - } - } - - [[nodiscard]] T* allocate(std::size_t size) - { - if(0 == buffers[thrid].size()){ - return m_ba.allocate(size); - } else { - buffer_ptr &container = buffers[thrid].back(); - T *data = container.m_buffer; - buffers[thrid].pop_back(); - return data; - } - } - - void deallocate(T* p, std::size_t size) - { - buffers[thrid].emplace_back(p, size); - } - - void release(){ - int size = buffers[thrid].size(); - for(int i=0; i &container = buffers[thrid].back(); - m_ba.deallocate(container.m_buffer, container.m_size); - buffers[thrid].pop_back(); - } - } - - }; - } // namespace allocator +namespace ghex +{ + +namespace allocator +{ + +template +struct buffer_ptr +{ + T* m_buffer; + std::size_t m_size; + + buffer_ptr() = delete; + buffer_ptr(T* p, std::size_t size) + : m_buffer{p} + , m_size{size} + { + } +}; + +template +static std::vector>> buffers; + +int thrid; +DECLARE_THREAD_PRIVATE(thrid) + +template +struct pool_allocator +{ + typedef T value_type; + + BaseAllocator m_ba; + + pool_allocator() + { + thrid = GET_THREAD_NUM(); + THREAD_MASTER() + { + thread_rank_type nthr = GET_NUM_THREADS(); + if (buffers.size() != nthr) { buffers.resize(nthr); } + } + THREAD_BARRIER(); + } + + pool_allocator(const pool_allocator& other) + : m_ba{other.m_ba} + { + } + + void initialize(int nb, int size) + { + for (int i = 0; i < nb; i++) + { + buffer_ptr container(m_ba.allocate(size), size); + memset(container.m_buffer, 0, size); + buffers[thrid].push_back(container); + } + } + + [[nodiscard]] T* allocate(std::size_t size) + { + if (0 == buffers[thrid].size()) { return m_ba.allocate(size); } + else + { + buffer_ptr& container = buffers[thrid].back(); + T* data = container.m_buffer; + buffers[thrid].pop_back(); + return data; + } + } + + void deallocate(T* p, std::size_t size) { buffers[thrid].emplace_back(p, size); } + + void release() + { + int size = buffers[thrid].size(); + for (int i = 0; i < size; i++) + { + buffer_ptr& container = 
buffers[thrid].back(); + m_ba.deallocate(container.m_buffer, container.m_size); + buffers[thrid].pop_back(); + } + } +}; +} // namespace allocator } // namespace ghex #endif /* INCLUDED_POOL_ALLOCATOR_HPP */ diff --git a/benchmarks/transport/utils.hpp b/benchmarks/transport/utils.hpp index ef7b243f..984c1904 100644 --- a/benchmarks/transport/utils.hpp +++ b/benchmarks/transport/utils.hpp @@ -17,18 +17,21 @@ #include template -void make_zero(Msg& msg) { - for (auto& c : msg) - c = 0; +void +make_zero(Msg& msg) +{ + for (auto& c : msg) c = 0; } -void bind_to_core(int thrid) +void +bind_to_core(int thrid) { cpu_set_t cpu_mask; - pid_t tid = syscall(SYS_gettid); + pid_t tid = syscall(SYS_gettid); CPU_ZERO(&cpu_mask); CPU_SET(thrid, &cpu_mask); - if (sched_setaffinity(tid, sizeof(cpu_mask), &cpu_mask) == -1){ + if (sched_setaffinity(tid, sizeof(cpu_mask), &cpu_mask) == -1) + { fprintf(stderr, "sched_setaffinity error : %s\n", strerror(errno)); exit(1); } diff --git a/benchmarks/unstructured_parmetis.cpp b/benchmarks/unstructured_parmetis.cpp index d7682ef9..9110d042 100644 --- a/benchmarks/unstructured_parmetis.cpp +++ b/benchmarks/unstructured_parmetis.cpp @@ -48,7 +48,6 @@ #include #endif - // GHEX type definitions #ifndef GHEX_TEST_USE_UCX using transport = gridtools::ghex::tl::mpi_tag; @@ -57,240 +56,290 @@ using transport = gridtools::ghex::tl::ucx_tag; #endif using domain_id_type = int; using global_index_type = idx_t; -using domain_descriptor_type = gridtools::ghex::unstructured::domain_descriptor; -using halo_generator_type = gridtools::ghex::unstructured::halo_generator; +using domain_descriptor_type = + gridtools::ghex::unstructured::domain_descriptor; +using halo_generator_type = + gridtools::ghex::unstructured::halo_generator; using grid_type = gridtools::ghex::unstructured::grid; template -using data_descriptor_cpu_type = gridtools::ghex::unstructured::data_descriptor; +using data_descriptor_cpu_type = + gridtools::ghex::unstructured::data_descriptor; using 
timer_type = gridtools::ghex::timer; #ifdef GHEX_CUDACC template using gpu_allocator_type = gridtools::ghex::allocator::cuda::allocator; template -using data_descriptor_gpu_type = gridtools::ghex::unstructured::data_descriptor; +using data_descriptor_gpu_type = + gridtools::ghex::unstructured::data_descriptor; using device_id_type = gridtools::ghex::arch_traits::device_id_type; #endif - template -char* as_bytes(T& i) { +char* +as_bytes(T& i) +{ return reinterpret_cast(&i); } template -std::vector counts_as_bytes(const C& c) { +std::vector +counts_as_bytes(const C& c) +{ std::vector res(c.size()); - std::transform(c.begin(), c.end(), res.begin(), [](auto i){ return i * sizeof(T); }); + std::transform(c.begin(), c.end(), res.begin(), [](auto i) { return i * sizeof(T); }); return res; } -std::vector counts_to_displs(const std::vector& counts) { +std::vector +counts_to_displs(const std::vector& counts) +{ std::vector displs(counts.size(), 0); - for (std::size_t i = 1; i < counts.size(); ++i) { - displs[i] = displs[i-1] + counts[i-1]; - } + for (std::size_t i = 1; i < counts.size(); ++i) { displs[i] = displs[i - 1] + counts[i - 1]; } return displs; } template -void initialize_field(const Domain& d, Field& f, const O d_id_offset) { +void +initialize_field(const Domain& d, Field& f, const O d_id_offset) +{ using value_type = typename Field::value_type; assert(f.size() == d.size() * d.levels()); - for (std::size_t i = 0; i < d.inner_size(); ++i) { - value_type val = static_cast(d.domain_id()) * d_id_offset + static_cast(d.vertices()[i]); - for (std::size_t level = 0; level < d.levels(); ++level) { + for (std::size_t i = 0; i < d.inner_size(); ++i) + { + value_type val = static_cast(d.domain_id()) * d_id_offset + + static_cast(d.vertices()[i]); + for (std::size_t level = 0; level < d.levels(); ++level) + { f[i * d.levels() + level] = val; // TO DO: use different values for different levels } } } template -void check_exchanged_data(const Domain& d, const Pattern& p, const 
Field& f, const O d_id_offset) { +void +check_exchanged_data(const Domain& d, const Pattern& p, const Field& f, const O d_id_offset) +{ using domain_id_type = typename Domain::domain_id_type; using index_type = typename Pattern::index_type; using value_type = typename Field::value_type; std::map halo_map{}; // index -> recv_domain_id - for (const auto& rh : p.recv_halos()) { - for (const auto i : rh.second.front().local_indices()) { + for (const auto& rh : p.recv_halos()) + { + for (const auto i : rh.second.front().local_indices()) + { halo_map.insert(std::make_pair(i, rh.first.id)); } } - for (const auto& pair : halo_map) { - value_type expected = static_cast(pair.second) * d_id_offset + static_cast(d.vertices()[pair.first]); - for (std::size_t level = 0; level < d.levels(); ++level) { + for (const auto& pair : halo_map) + { + value_type expected = static_cast(pair.second) * d_id_offset + + static_cast(d.vertices()[pair.first]); + for (std::size_t level = 0; level < d.levels(); ++level) + { EXPECT_EQ(f[pair.first * d.levels() + level], expected); } } } template -Domain make_reindexed_domain(const Domain& d, const Pattern& p) { +Domain +make_reindexed_domain(const Domain& d, const Pattern& p) +{ using vertices_type = typename Domain::vertices_type; vertices_type vs{}; vs.reserve(d.size()); vs.insert(vs.end(), d.vertices().begin(), d.vertices().begin() + d.inner_size()); - for (const auto& rh : p.recv_halos()) { - for (const auto i : rh.second.front().local_indices()) { - vs.push_back(d.vertices()[i]); - } + for (const auto& rh : p.recv_halos()) + { + for (const auto i : rh.second.front().local_indices()) { vs.push_back(d.vertices()[i]); } } Domain res{d.domain_id(), vs, d.inner_size(), d.levels()}; return res; } template -int domain_to_rank(const DomainId d_id, const int num_threads) { +int +domain_to_rank(const DomainId d_id, const int num_threads) +{ return d_id / num_threads; } template -std::vector rank_to_domains(const int rank, const int num_threads) { 
+std::vector +rank_to_domains(const int rank, const int num_threads) +{ std::vector res(num_threads); - for (int i = 0; i < num_threads; ++i) { - res[i] = rank * num_threads + i; - } + for (int i = 0; i < num_threads; ++i) { res[i] = rank * num_threads + i; } return res; } template -struct d_v_pair { - +struct d_v_pair +{ using domain_id_type = DomainId; using v_id_type = VertexId; domain_id_type d_id; - v_id_type v_id; + v_id_type v_id; /** @brief unique ordering given by domain id and vertex id*/ - bool operator < (const d_v_pair& other) const noexcept { + bool operator<(const d_v_pair& other) const noexcept + { return d_id < other.d_id ? true : (d_id == other.d_id ? v_id < other.v_id : false); } - }; -using vertices_dist_type = std::map, std::vector>>; +using vertices_dist_type = + std::map, std::vector>>; using domain_vertices_dist_type = std::map>>; -domain_vertices_dist_type distribute_parmetis(vertices_dist_type& vertices_dist, std::size_t n_vertices, MPI_Comm comm) { - +domain_vertices_dist_type +distribute_parmetis(vertices_dist_type& vertices_dist, std::size_t n_vertices, MPI_Comm comm) +{ int size; MPI_Comm_size(comm, &size); // 1) all-to-all: number of vertices per rank std::vector s_n_vertices_rank(size); - for (int i = 0; i < size; ++i) { - s_n_vertices_rank[i] = vertices_dist[i].size(); // any missing rank gets actually inserted into the map here + for (int i = 0; i < size; ++i) + { + s_n_vertices_rank[i] = + vertices_dist[i].size(); // any missing rank gets actually inserted into the map here }; std::vector r_n_vertices_rank(size); - MPI_Alltoall(s_n_vertices_rank.data(), sizeof(int), MPI_BYTE, - r_n_vertices_rank.data(), sizeof(int), MPI_BYTE, - comm); + MPI_Alltoall(s_n_vertices_rank.data(), sizeof(int), MPI_BYTE, r_n_vertices_rank.data(), + sizeof(int), MPI_BYTE, comm); // 2) all-to-all: vertex ids std::vector s_v_ids_rank{}; s_v_ids_rank.reserve(n_vertices); - for (const auto& r_m_pair : vertices_dist) { - for (const auto& v_a_pair : 
r_m_pair.second) { + for (const auto& r_m_pair : vertices_dist) + { + for (const auto& v_a_pair : r_m_pair.second) + { s_v_ids_rank.push_back(v_a_pair.first.v_id); } } - std::vector s_v_ids_rank_counts = counts_as_bytes(s_n_vertices_rank); - std::vector s_v_ids_rank_displs = counts_to_displs(s_v_ids_rank_counts); - std::vector r_v_ids_rank(std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); + std::vector s_v_ids_rank_counts = counts_as_bytes(s_n_vertices_rank); + std::vector s_v_ids_rank_displs = counts_to_displs(s_v_ids_rank_counts); + std::vector r_v_ids_rank( + std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); std::vector r_v_ids_rank_counts = counts_as_bytes(r_n_vertices_rank); std::vector r_v_ids_rank_displs = counts_to_displs(r_v_ids_rank_counts); - MPI_Alltoallv(s_v_ids_rank.data(), s_v_ids_rank_counts.data(), s_v_ids_rank_displs.data(), MPI_BYTE, - r_v_ids_rank.data(), r_v_ids_rank_counts.data(), r_v_ids_rank_displs.data(), MPI_BYTE, - comm); + MPI_Alltoallv(s_v_ids_rank.data(), s_v_ids_rank_counts.data(), s_v_ids_rank_displs.data(), + MPI_BYTE, r_v_ids_rank.data(), r_v_ids_rank_counts.data(), r_v_ids_rank_displs.data(), + MPI_BYTE, comm); // 3) all-to-all: domain ids std::vector s_d_ids_rank{}; s_d_ids_rank.reserve(n_vertices); - for (const auto& r_m_pair : vertices_dist) { - for (const auto& v_a_pair : r_m_pair.second) { + for (const auto& r_m_pair : vertices_dist) + { + for (const auto& v_a_pair : r_m_pair.second) + { s_d_ids_rank.push_back(v_a_pair.first.d_id); } } std::vector s_d_ids_rank_counts = counts_as_bytes(s_n_vertices_rank); std::vector s_d_ids_rank_displs = counts_to_displs(s_d_ids_rank_counts); - std::vector r_d_ids_rank(std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); + std::vector r_d_ids_rank( + std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); std::vector r_d_ids_rank_counts = counts_as_bytes(r_n_vertices_rank); std::vector r_d_ids_rank_displs = 
counts_to_displs(r_d_ids_rank_counts); - MPI_Alltoallv(s_d_ids_rank.data(), s_d_ids_rank_counts.data(), s_d_ids_rank_displs.data(), MPI_BYTE, - r_d_ids_rank.data(), r_d_ids_rank_counts.data(), r_d_ids_rank_displs.data(), MPI_BYTE, - comm); + MPI_Alltoallv(s_d_ids_rank.data(), s_d_ids_rank_counts.data(), s_d_ids_rank_displs.data(), + MPI_BYTE, r_d_ids_rank.data(), r_d_ids_rank_counts.data(), r_d_ids_rank_displs.data(), + MPI_BYTE, comm); // 4) all-to-all: adjacency size per vertex per rank std::vector s_adjncy_size_vertex_rank{}; s_adjncy_size_vertex_rank.reserve(n_vertices); - for (const auto& r_m_pair : vertices_dist) { - for (const auto& v_a_pair : r_m_pair.second) { + for (const auto& r_m_pair : vertices_dist) + { + for (const auto& v_a_pair : r_m_pair.second) + { s_adjncy_size_vertex_rank.push_back(v_a_pair.second.size()); } } std::vector s_adjncy_size_vertex_rank_counts = counts_as_bytes(s_n_vertices_rank); - std::vector s_adjncy_size_vertex_rank_displs = counts_to_displs(s_adjncy_size_vertex_rank_counts); - std::vector r_adjncy_size_vertex_rank(std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); + std::vector s_adjncy_size_vertex_rank_displs = + counts_to_displs(s_adjncy_size_vertex_rank_counts); + std::vector r_adjncy_size_vertex_rank( + std::accumulate(r_n_vertices_rank.begin(), r_n_vertices_rank.end(), 0)); std::vector r_adjncy_size_vertex_rank_counts = counts_as_bytes(r_n_vertices_rank); - std::vector r_adjncy_size_vertex_rank_displs = counts_to_displs(r_adjncy_size_vertex_rank_counts); - MPI_Alltoallv(s_adjncy_size_vertex_rank.data(), s_adjncy_size_vertex_rank_counts.data(), s_adjncy_size_vertex_rank_displs.data(), MPI_BYTE, - r_adjncy_size_vertex_rank.data(), r_adjncy_size_vertex_rank_counts.data(), r_adjncy_size_vertex_rank_displs.data(), MPI_BYTE, - comm); + std::vector r_adjncy_size_vertex_rank_displs = + counts_to_displs(r_adjncy_size_vertex_rank_counts); + MPI_Alltoallv(s_adjncy_size_vertex_rank.data(), 
s_adjncy_size_vertex_rank_counts.data(), + s_adjncy_size_vertex_rank_displs.data(), MPI_BYTE, r_adjncy_size_vertex_rank.data(), + r_adjncy_size_vertex_rank_counts.data(), r_adjncy_size_vertex_rank_displs.data(), MPI_BYTE, + comm); // 5) all-to-all: adjacency per rank std::vector s_adjncy_rank{}; - s_adjncy_rank.reserve(std::accumulate(s_adjncy_size_vertex_rank.begin(), s_adjncy_size_vertex_rank.end(), 0)); - for (const auto& r_m_pair : vertices_dist) { - for (const auto& v_a_pair : r_m_pair.second) { - s_adjncy_rank.insert(s_adjncy_rank.end(), v_a_pair.second.begin(), v_a_pair.second.end()); + s_adjncy_rank.reserve( + std::accumulate(s_adjncy_size_vertex_rank.begin(), s_adjncy_size_vertex_rank.end(), 0)); + for (const auto& r_m_pair : vertices_dist) + { + for (const auto& v_a_pair : r_m_pair.second) + { + s_adjncy_rank.insert(s_adjncy_rank.end(), v_a_pair.second.begin(), + v_a_pair.second.end()); } } std::vector s_adjncy_rank_counts{}; s_adjncy_rank_counts.reserve(size); - for (auto a_it = s_adjncy_size_vertex_rank.begin(), r_it = s_n_vertices_rank.begin(); r_it < s_n_vertices_rank.end(); ++r_it) { + for (auto a_it = s_adjncy_size_vertex_rank.begin(), r_it = s_n_vertices_rank.begin(); + r_it < s_n_vertices_rank.end(); ++r_it) + { s_adjncy_rank_counts.push_back(std::accumulate(a_it, a_it + *r_it, 0) * sizeof(idx_t)); a_it += *r_it; } - std::vector s_adjncy_rank_displs = counts_to_displs(s_adjncy_rank_counts); - std::vector r_adjncy_rank(std::accumulate(r_adjncy_size_vertex_rank.begin(), r_adjncy_size_vertex_rank.end(), 0)); + std::vector s_adjncy_rank_displs = counts_to_displs(s_adjncy_rank_counts); + std::vector r_adjncy_rank( + std::accumulate(r_adjncy_size_vertex_rank.begin(), r_adjncy_size_vertex_rank.end(), 0)); std::vector r_adjncy_rank_counts{}; r_adjncy_rank_counts.reserve(size); - for (auto a_it = r_adjncy_size_vertex_rank.begin(), r_it = r_n_vertices_rank.begin(); r_it < r_n_vertices_rank.end(); ++r_it) { + for (auto a_it = 
r_adjncy_size_vertex_rank.begin(), r_it = r_n_vertices_rank.begin(); + r_it < r_n_vertices_rank.end(); ++r_it) + { r_adjncy_rank_counts.push_back(std::accumulate(a_it, a_it + *r_it, 0) * sizeof(idx_t)); a_it += *r_it; } std::vector r_adjncy_rank_displs = counts_to_displs(r_adjncy_rank_counts); - MPI_Alltoallv(s_adjncy_rank.data(), s_adjncy_rank_counts.data(), s_adjncy_rank_displs.data(), MPI_BYTE, - r_adjncy_rank.data(), r_adjncy_rank_counts.data(), r_adjncy_rank_displs.data(), MPI_BYTE, - comm); + MPI_Alltoallv(s_adjncy_rank.data(), s_adjncy_rank_counts.data(), s_adjncy_rank_displs.data(), + MPI_BYTE, r_adjncy_rank.data(), r_adjncy_rank_counts.data(), r_adjncy_rank_displs.data(), + MPI_BYTE, comm); // 6) per-domain vertices distribution map domain_vertices_dist_type domain_vertices_dist{}; - for (std::size_t i = 0, a_idx = 0; i < r_v_ids_rank.size(); ++i) { + for (std::size_t i = 0, a_idx = 0; i < r_v_ids_rank.size(); ++i) + { auto a_begin = r_adjncy_rank.begin() + a_idx; auto a_end = a_begin + r_adjncy_size_vertex_rank[i]; - domain_vertices_dist[r_d_ids_rank[i]] - .insert(std::make_pair(r_v_ids_rank[i], std::vector{a_begin, a_end})); + domain_vertices_dist[r_d_ids_rank[i]].insert( + std::make_pair(r_v_ids_rank[i], std::vector{a_begin, a_end})); a_idx += r_adjncy_size_vertex_rank[i]; } return domain_vertices_dist; - } template -void debug_print(const C& c) { +void +debug_print(const C& c) +{ std::cout << "Size = " << c.size() << "; elements = [ "; for (const auto& elem : c) { std::cout << elem << " "; } std::cout << "]\n"; } - /** @brief Unstructured exchange benchmark (in-place receive against buffered receive)*/ -TEST(unstructured_parmetis, receive_type) { - +TEST(unstructured_parmetis, receive_type) +{ // type definitions using data_int_type = int64_t; - static_assert(std::is_same::value, "data integer type must be the same as ParMETIS integer type"); + static_assert(std::is_same::value, + "data integer type must be the same as ParMETIS integer type"); // MPI 
setup MPI_Comm comm; @@ -301,96 +350,93 @@ TEST(unstructured_parmetis, receive_type) { // Threads auto env_threads = std::getenv("GHEX_PARMETIS_BENCHMARK_NUM_THREADS"); - int num_threads = (env_threads) ? std::atoi(env_threads) : 1; + int num_threads = (env_threads) ? std::atoi(env_threads) : 1; // Ap std::ifstream ap_fs("Ap.out", std::ios_base::binary); ap_fs.seekg(0, std::ios_base::end); // go to the end idx_t all_num_vertices = ap_fs.tellg() / sizeof(idx_t) - 1; - ap_fs.seekg(all_num_vertices / size * sizeof(idx_t) * rank); // rewind to begin of section, according to rank (remainder is handled entirely by last rank, TO DO: not optimal) + ap_fs.seekg( + all_num_vertices / size * sizeof(idx_t) * + rank); // rewind to begin of section, according to rank (remainder is handled entirely by last rank, TO DO: not optimal) std::vector ap{}; - if (rank == (size - 1)) { // last rank reads until eof - for (idx_t b; ap_fs.read(as_bytes(b), sizeof(b)); ) { - ap.push_back(b); - } - } else { // all other ranks read until end of their section - idx_t section_size = all_num_vertices / size + 1; // (CSR format provides always the two endpoints, first included, second excluded) - for (idx_t i = 0, b; i < section_size; ++i) { + if (rank == (size - 1)) + { // last rank reads until eof + for (idx_t b; ap_fs.read(as_bytes(b), sizeof(b));) { ap.push_back(b); } + } + else + { // all other ranks read until end of their section + idx_t section_size = + all_num_vertices / size + + 1; // (CSR format provides always the two endpoints, first included, second excluded) + for (idx_t i = 0, b; i < section_size; ++i) + { ap_fs.read(as_bytes(b), sizeof(b)); ap.push_back(b); } } - idx_t ap_offset = ap.front(); + idx_t ap_offset = ap.front(); std::vector ap_n(ap.size()); - std::transform(ap.begin(), ap.end(), ap_n.begin(), [ap_offset](auto i){ return i - ap_offset; }); // normalize + std::transform(ap.begin(), ap.end(), ap_n.begin(), + [ap_offset](auto i) { return i - ap_offset; }); // normalize // 
Ai std::ifstream ai_fs("Ai.out", std::ios_base::binary); ai_fs.seekg(ap.front() * sizeof(idx_t)); std::vector ai{}; - for (idx_t i = ap.front(), b; i < ap.back(); ++i) { + for (idx_t i = ap.front(), b; i < ap.back(); ++i) + { ai_fs.read(as_bytes(b), sizeof(b)); ai.push_back(b); } // Vertices initial distribution std::vector vtxdist_v(size + 1); - idx_t num_vertices = all_num_vertices / size; - for (int i = 0; i < size; ++i) { - vtxdist_v[i] = num_vertices * i; - } + idx_t num_vertices = all_num_vertices / size; + for (int i = 0; i < size; ++i) { vtxdist_v[i] = num_vertices * i; } vtxdist_v[size] = all_num_vertices; // Vertices final distribution (output) std::vector part_v(ap.size() - 1); // ParMETIS variables - idx_t wgtflag = 0; - idx_t numflag = 0; - idx_t ncon = 1; // TO DO: might vary - idx_t nparts = size * num_threads; - std::vector tpwgts_v(ncon * nparts, 1 / static_cast(nparts)); // TO DO: might vary - std::vector ubvec_v(ncon, 1.02); // TO DO: might vary + idx_t wgtflag = 0; + idx_t numflag = 0; + idx_t ncon = 1; // TO DO: might vary + idx_t nparts = size * num_threads; + std::vector tpwgts_v(ncon * nparts, + 1 / static_cast(nparts)); // TO DO: might vary + std::vector ubvec_v(ncon, 1.02); // TO DO: might vary std::array options{0, 0, 0}; - idx_t edgecut; + idx_t edgecut; // ParMETIS graph partitioning - ParMETIS_V3_PartKway(vtxdist_v.data(), - ap_n.data(), - ai.data(), - NULL, - NULL, - &wgtflag, - &numflag, - &ncon, - &nparts, - tpwgts_v.data(), - ubvec_v.data(), - options.data(), - &edgecut, - part_v.data(), - &comm); + ParMETIS_V3_PartKway(vtxdist_v.data(), ap_n.data(), ai.data(), NULL, NULL, &wgtflag, &numflag, + &ncon, &nparts, tpwgts_v.data(), ubvec_v.data(), options.data(), &edgecut, part_v.data(), + &comm); // repartition output according to parmetis labeling vertices_dist_type vertices_dist{}; - for (idx_t v_id = vtxdist_v[rank], i = 0; i < static_cast(ap_n.size() - 1); ++v_id, ++i) { - vertices_dist[domain_to_rank(part_v[i], num_threads)] - 
.insert(std::make_pair(d_v_pair{static_cast(part_v[i]), v_id}, std::vector{ai.begin() + ap_n[i], ai.begin() + ap_n[i+1]})); + for (idx_t v_id = vtxdist_v[rank], i = 0; i < static_cast(ap_n.size() - 1); ++v_id, ++i) + { + vertices_dist[domain_to_rank(part_v[i], num_threads)].insert(std::make_pair( + d_v_pair{static_cast(part_v[i]), v_id}, + std::vector{ai.begin() + ap_n[i], ai.begin() + ap_n[i + 1]})); } auto domain_vertices_dist = distribute_parmetis(vertices_dist, ap_n.size() - 1, comm); // GHEX constants const std::size_t levels = 100; - const idx_t d_id_offset = 10e9; - const int n_iters_warm_up = 50; - const int n_iters = 50; + const idx_t d_id_offset = 10e9; + const int n_iters_warm_up = 50; + const int n_iters = 50; #ifndef GHEX_CUDACC // GHEX context - auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; - int gh_rank = context.rank(); + int gh_rank = context.rank(); // barrier gridtools::ghex::tl::barrier_t gh_barrier{static_cast(num_threads)}; @@ -414,15 +460,15 @@ TEST(unstructured_parmetis, receive_type) { ss_file << gh_rank; std::string filename = #ifdef GHEX_PARMETIS_BENCHMARK_UNORDERED - "unstructured_parmetis_receive_type_unordered_" + "unstructured_parmetis_receive_type_unordered_" #endif #ifdef GHEX_PARMETIS_BENCHMARK_ORDERED - "unstructured_parmetis_receive_type_ordered_" + "unstructured_parmetis_receive_type_ordered_" #endif #ifdef GHEX_PARMETIS_BENCHMARK_IPR - "unstructured_parmetis_receive_type_ipr_" + "unstructured_parmetis_receive_type_ipr_" #endif - + ss_file.str() + ".txt"; + + ss_file.str() + ".txt"; std::ofstream file(filename.c_str()); file << "Unstructured ParMETIS receive type benchmark\n\n"; @@ -430,25 +476,32 @@ TEST(unstructured_parmetis, receive_type) { // setup std::vector local_domains{}; - for (auto d_id : rank_to_domains(gh_rank, num_threads)) { + for (auto d_id : rank_to_domains(gh_rank, 
num_threads)) + { std::vector vertices{}; - vertices.reserve(domain_vertices_dist[d_id].size()); // any missing domain gets actually inserted into the map here - std::vector adjncy{}; // size may be computed in advance, not preformance critical anyway - for (const auto& v_a_pair : domain_vertices_dist[d_id]) { + vertices.reserve( + domain_vertices_dist[d_id] + .size()); // any missing domain gets actually inserted into the map here + std::vector + adjncy{}; // size may be computed in advance, not preformance critical anyway + for (const auto& v_a_pair : domain_vertices_dist[d_id]) + { vertices.push_back(v_a_pair.first); adjncy.insert(adjncy.end(), v_a_pair.second.begin(), v_a_pair.second.end()); } - local_domains.push_back(domain_descriptor_type{d_id, vertices, adjncy, levels}); // CSR constructor + local_domains.push_back( + domain_descriptor_type{d_id, vertices, adjncy, levels}); // CSR constructor } halo_generator_type hg{}; - auto p = gridtools::ghex::make_pattern(context, hg, local_domains); + auto p = gridtools::ghex::make_pattern(context, hg, local_domains); using pattern_container_type = decltype(p); #ifdef GHEX_PARMETIS_BENCHMARK_UNORDERED - std::vector> f{}; + std::vector> f{}; std::vector> data{}; - for (const auto& d : local_domains) { + for (const auto& d : local_domains) + { std::vector local_f(d.size() * d.levels(), 0); initialize_field(d, local_f, d_id_offset); f.push_back(std::move(local_f)); @@ -456,15 +509,18 @@ TEST(unstructured_parmetis, receive_type) { } // thread function - auto thread_func = [&context, &gh_barrier, &t_buf_local, &t_buf_local_mutex](auto bi){ - auto th_comm = context.get_communicator(); + auto thread_func = [&context, &gh_barrier, &t_buf_local, &t_buf_local_mutex](auto bi) + { + auto th_comm = context.get_communicator(); timer_type t_buf_local_th; - auto co = gridtools::ghex::make_communication_object(th_comm); - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + auto co = 
gridtools::ghex::make_communication_object(th_comm); + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h = co.exchange(bi); h.wait(); } - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; gh_barrier(th_comm); t_local.tic(); @@ -479,13 +535,12 @@ TEST(unstructured_parmetis, receive_type) { // run std::vector threads{}; - for (auto& d : data) { - threads.push_back(std::thread{thread_func, p(d)}); - } + for (auto& d : data) { threads.push_back(std::thread{thread_func, p(d)}); } for (auto& t : threads) t.join(); // check - for (std::size_t i = 0; i < f.size(); ++i) { + for (std::size_t i = 0; i < f.size(); ++i) + { check_exchanged_data(local_domains[i], p[i], f[i], d_id_offset); } @@ -494,19 +549,20 @@ TEST(unstructured_parmetis, receive_type) { // exchanged size idx_t n_halo_vertices_local{0}, n_halo_vertices_global; - for (const auto& d : local_domains) { - n_halo_vertices_local += (d.size() - d.inner_size()); - } - MPI_Allreduce(&n_halo_vertices_local, &n_halo_vertices_global, 1, MPI_INT64_T, MPI_SUM, context.mpi_comm()); // MPI type set according to parmetis idx type + for (const auto& d : local_domains) { n_halo_vertices_local += (d.size() - d.inner_size()); } + MPI_Allreduce(&n_halo_vertices_local, &n_halo_vertices_global, 1, MPI_INT64_T, MPI_SUM, + context.mpi_comm()); // MPI type set according to parmetis idx type // output file << "total exchanged size in GB (assuming value type = idx_t): " - << static_cast(n_halo_vertices_global * levels * sizeof(idx_t) * 2) / (1024.0 * 1024.0 * 1024.0) << "\n\n" + << static_cast(n_halo_vertices_global * levels * sizeof(idx_t) * 2) / + (1024.0 * 1024.0 * 1024.0) + << "\n\n" << "1 - unordered halos - buffered receive - CPU\n" - << "\tlocal time = " << t_buf_local.mean() / 1000.0 - << "+/-" << t_buf_local.stddev() / (std::sqrt(t_buf_local.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_buf_global.mean() / 1000.0 - << "+/-" << 
t_buf_global.stddev() / (std::sqrt(t_buf_global.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_buf_local.mean() / 1000.0 << "+/-" + << t_buf_local.stddev() / (std::sqrt(t_buf_local.num_samples()) * 1000.0) << "ms\n" + << "\tglobal time = " << t_buf_global.mean() / 1000.0 << "+/-" + << t_buf_global.stddev() / (std::sqrt(t_buf_global.num_samples()) * 1000.0) << "ms\n"; #endif @@ -514,16 +570,19 @@ TEST(unstructured_parmetis, receive_type) { // setup std::vector local_domains_ord{}; - for (std::size_t i = 0; i < local_domains.size(); ++i) { + for (std::size_t i = 0; i < local_domains.size(); ++i) + { local_domains_ord.push_back(make_reindexed_domain(local_domains[i], p[i])); } - auto p_ord = gridtools::ghex::make_pattern(context, hg, local_domains_ord); // easiest way, but quite redundant: only recv halos are different + auto p_ord = gridtools::ghex::make_pattern(context, hg, + local_domains_ord); // easiest way, but quite redundant: only recv halos are different #ifdef GHEX_PARMETIS_BENCHMARK_ORDERED - std::vector> f_ord{}; + std::vector> f_ord{}; std::vector> data_ord{}; - for (const auto& d_ord : local_domains_ord) { + for (const auto& d_ord : local_domains_ord) + { std::vector local_f_ord(d_ord.size() * d_ord.levels(), 0); initialize_field(d_ord, local_f_ord, d_id_offset); f_ord.push_back(std::move(local_f_ord)); @@ -531,15 +590,19 @@ TEST(unstructured_parmetis, receive_type) { } // thread function - auto thread_func_ord = [&context, &gh_barrier, &t_ord_buf_local, &t_ord_buf_local_mutex](auto bi){ - auto th_comm = context.get_communicator(); + auto thread_func_ord = [&context, &gh_barrier, &t_ord_buf_local, &t_ord_buf_local_mutex]( + auto bi) + { + auto th_comm = context.get_communicator(); timer_type t_ord_buf_local_th; auto co_ord = gridtools::ghex::make_communication_object(th_comm); - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h_ord = co_ord.exchange(bi); h_ord.wait(); 
} - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; gh_barrier(th_comm); t_local.tic(); @@ -554,13 +617,15 @@ TEST(unstructured_parmetis, receive_type) { // run std::vector threads_ord{}; - for (auto& d_ord : data_ord) { + for (auto& d_ord : data_ord) + { threads_ord.push_back(std::thread{thread_func_ord, p_ord(d_ord)}); } for (auto& t_ord : threads_ord) t_ord.join(); // check - for (std::size_t i = 0; i < f_ord.size(); ++i) { + for (std::size_t i = 0; i < f_ord.size(); ++i) + { check_exchanged_data(local_domains_ord[i], p_ord[i], f_ord[i], d_id_offset); } @@ -568,10 +633,11 @@ TEST(unstructured_parmetis, receive_type) { auto t_ord_buf_global = gridtools::ghex::reduce(t_ord_buf_local, context.mpi_comm()); file << "2 - ordered halos - buffered receive - CPU\n" - << "\tlocal time = " << t_ord_buf_local.mean() / 1000.0 - << "+/-" << t_ord_buf_local.stddev() / (std::sqrt(t_ord_buf_local.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_ord_buf_global.mean() / 1000.0 - << "+/-" << t_ord_buf_global.stddev() / (std::sqrt(t_ord_buf_global.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_ord_buf_local.mean() / 1000.0 << "+/-" + << t_ord_buf_local.stddev() / (std::sqrt(t_ord_buf_local.num_samples()) * 1000.0) << "ms\n" + << "\tglobal time = " << t_ord_buf_global.mean() / 1000.0 << "+/-" + << t_ord_buf_global.stddev() / (std::sqrt(t_ord_buf_global.num_samples()) * 1000.0) + << "ms\n"; #endif @@ -579,9 +645,10 @@ TEST(unstructured_parmetis, receive_type) { #ifdef GHEX_PARMETIS_BENCHMARK_IPR - std::vector> f_ipr{}; + std::vector> f_ipr{}; std::vector> data_ipr{}; - for (const auto& d_ord : local_domains_ord) { + for (const auto& d_ord : local_domains_ord) + { std::vector local_f_ipr(d_ord.size() * d_ord.levels(), 0); initialize_field(d_ord, local_f_ipr, d_id_offset); f_ipr.push_back(std::move(local_f_ipr)); @@ -589,15 +656,20 @@ TEST(unstructured_parmetis, receive_type) { } // 
thread function - auto thread_func_ipr = [&context, &gh_barrier, &t_ord_ipr_local, &t_ord_ipr_local_mutex](auto bi){ - auto th_comm = context.get_communicator(); + auto thread_func_ipr = [&context, &gh_barrier, &t_ord_ipr_local, &t_ord_ipr_local_mutex]( + auto bi) + { + auto th_comm = context.get_communicator(); timer_type t_ord_ipr_local_th; - auto co_ipr = gridtools::ghex::make_communication_object_ipr(th_comm); - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + auto co_ipr = + gridtools::ghex::make_communication_object_ipr(th_comm); + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h_ipr = co_ipr.exchange(bi); h_ipr.wait(); } - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; gh_barrier(th_comm); t_local.tic(); @@ -612,13 +684,15 @@ TEST(unstructured_parmetis, receive_type) { // run std::vector threads_ipr{}; - for (auto& d_ipr : data_ipr) { + for (auto& d_ipr : data_ipr) + { threads_ipr.push_back(std::thread{thread_func_ipr, p_ord(d_ipr)}); } for (auto& t_ipr : threads_ipr) t_ipr.join(); // check - for (std::size_t i = 0; i < f_ipr.size(); ++i) { + for (std::size_t i = 0; i < f_ipr.size(); ++i) + { check_exchanged_data(local_domains_ord[i], p_ord[i], f_ipr[i], d_id_offset); } @@ -626,36 +700,38 @@ TEST(unstructured_parmetis, receive_type) { auto t_ord_ipr_global = gridtools::ghex::reduce(t_ord_ipr_local, context.mpi_comm()); file << "3 - ordered halos - in-place receive - CPU\n" - << "\tlocal time = " << t_ord_ipr_local.mean() / 1000.0 - << "+/-" << t_ord_ipr_local.stddev() / (std::sqrt(t_ord_ipr_local.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_ord_ipr_global.mean() / 1000.0 - << "+/-" << t_ord_ipr_global.stddev() / (std::sqrt(t_ord_ipr_global.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_ord_ipr_local.mean() / 1000.0 << "+/-" + << t_ord_ipr_local.stddev() / (std::sqrt(t_ord_ipr_local.num_samples()) * 1000.0) << "ms\n" + << 
"\tglobal time = " << t_ord_ipr_global.mean() / 1000.0 << "+/-" + << t_ord_ipr_global.stddev() / (std::sqrt(t_ord_ipr_global.num_samples()) * 1000.0) + << "ms\n"; #endif #else // GHEX context - auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); + auto context_ptr = gridtools::ghex::tl::context_factory::create(MPI_COMM_WORLD); auto& context = *context_ptr; - int gh_rank = context.rank(); - auto gh_comm = context.get_communicator(); - int num_devices; + int gh_rank = context.rank(); + auto gh_comm = context.get_communicator(); + int num_devices; GHEX_CHECK_CUDA_RESULT(cudaGetDeviceCount(&num_devices)); device_id_type device_id = gh_rank % num_devices; GHEX_CHECK_CUDA_RESULT(cudaSetDevice(device_id)); // timers - timer_type t_buf_local_gpu, t_buf_global_gpu; // 1 - unordered halos - buffered receive + timer_type t_buf_local_gpu, t_buf_global_gpu; // 1 - unordered halos - buffered receive timer_type t_ord_buf_local_gpu, t_ord_buf_global_gpu; // 2 - ordered halos - buffered receive timer_type t_ord_ipr_local_gpu, t_ord_ipr_global_gpu; // 3 - ordered halos - in-place receive // output file std::stringstream ss_file; ss_file << gh_rank; - std::string filename = "unstructured_parmetis_receive_type_gpu_" + ss_file.str() + ".txt"; + std::string filename = "unstructured_parmetis_receive_type_gpu_" + ss_file.str() + ".txt"; std::ofstream file(filename.c_str()); - file << "Unstructured ParMETIS receive type benchmark; DEBUG: GPU device id = " << device_id << "\n\n"; + file << "Unstructured ParMETIS receive type benchmark; DEBUG: GPU device id = " << device_id + << "\n\n"; // GPU allocator gpu_allocator_type gpu_alloc{}; @@ -663,32 +739,37 @@ TEST(unstructured_parmetis, receive_type) { // 1 ======== unordered halos - buffered receive ========================= // setup - domain_id_type d_id{gh_rank}; // 1 domain per rank + domain_id_type d_id{gh_rank}; // 1 domain per rank std::vector vertices{}; - vertices.reserve(domain_vertices_dist[d_id].size()); 
// any missing domain gets actually inserted into the map here + vertices.reserve(domain_vertices_dist[d_id] + .size()); // any missing domain gets actually inserted into the map here std::vector adjncy{}; // size may be computed in advance, not preformance critical anyway - for (const auto& v_a_pair : domain_vertices_dist[d_id]) { - vertices.push_back(v_a_pair.first); + for (const auto& v_a_pair : domain_vertices_dist[d_id]) + { + vertices.push_back(v_a_pair.first); adjncy.insert(adjncy.end(), v_a_pair.second.begin(), v_a_pair.second.end()); } - domain_descriptor_type d{d_id, vertices, adjncy, levels}; // CSR constructor + domain_descriptor_type d{d_id, vertices, adjncy, levels}; // CSR constructor std::vector local_domains{d}; - halo_generator_type hg{}; + halo_generator_type hg{}; auto p = gridtools::ghex::make_pattern(context, hg, local_domains); using pattern_container_type = decltype(p); auto co = gridtools::ghex::make_communication_object(gh_comm); std::vector f_cpu(d.size() * d.levels(), 0); initialize_field(d, f_cpu, d_id_offset); idx_t* f_gpu = gpu_alloc.allocate(d.size() * d.levels()); - GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_gpu, f_cpu.data(), d.size() * d.levels() * sizeof(idx_t), cudaMemcpyHostToDevice)); + GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_gpu, f_cpu.data(), d.size() * d.levels() * sizeof(idx_t), + cudaMemcpyHostToDevice)); data_descriptor_gpu_type data_gpu{d, f_gpu, 1, true, device_id}; // exchange - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h_gpu = co.exchange(p(data_gpu)); h_gpu.wait(); } - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); @@ -711,22 +792,27 @@ TEST(unstructured_parmetis, receive_type) { // 2 ======== ordered halos - buffered receive =========================== // setup - domain_descriptor_type d_ord = make_reindexed_domain(d, p[0]); + 
domain_descriptor_type d_ord = make_reindexed_domain(d, p[0]); std::vector local_domains_ord{d_ord}; - auto p_ord = gridtools::ghex::make_pattern(context, hg, local_domains_ord); // easiest way, but quite redundant: only recv halos are different - auto co_ord = gridtools::ghex::make_communication_object(gh_comm); // new one, same conditions + auto p_ord = gridtools::ghex::make_pattern(context, hg, + local_domains_ord); // easiest way, but quite redundant: only recv halos are different + auto co_ord = gridtools::ghex::make_communication_object( + gh_comm); // new one, same conditions std::vector f_ord_cpu(d_ord.size() * d_ord.levels(), 0); initialize_field(d_ord, f_ord_cpu, d_id_offset); idx_t* f_ord_gpu = gpu_alloc.allocate(d_ord.size() * d_ord.levels()); - GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_ord_gpu, f_ord_cpu.data(), d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyHostToDevice)); + GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_ord_gpu, f_ord_cpu.data(), + d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyHostToDevice)); data_descriptor_gpu_type data_ord_gpu{d_ord, f_ord_gpu, 1, true, device_id}; // exchange - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h_ord_gpu = co_ord.exchange(p_ord(data_ord_gpu)); h_ord_gpu.wait(); } - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); @@ -740,7 +826,8 @@ TEST(unstructured_parmetis, receive_type) { } // check - cudaMemcpy(f_ord_cpu.data(), f_ord_gpu, d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyDeviceToHost); + cudaMemcpy(f_ord_cpu.data(), f_ord_gpu, d_ord.size() * d_ord.levels() * sizeof(idx_t), + cudaMemcpyDeviceToHost); check_exchanged_data(d_ord, p_ord[0], f_ord_cpu, d_id_offset); // deallocate @@ -753,15 +840,18 @@ TEST(unstructured_parmetis, receive_type) { std::vector f_ipr_cpu(d_ord.size() * d_ord.levels(), 
0); initialize_field(d_ord, f_ipr_cpu, d_id_offset); idx_t* f_ipr_gpu = gpu_alloc.allocate(d_ord.size() * d_ord.levels()); - GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_ipr_gpu, f_ipr_cpu.data(), d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyHostToDevice)); + GHEX_CHECK_CUDA_RESULT(cudaMemcpy(f_ipr_gpu, f_ipr_cpu.data(), + d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyHostToDevice)); data_descriptor_gpu_type data_ipr_gpu{d_ord, f_ipr_gpu, 1, true, device_id}; // exchange - for (int i = 0; i < n_iters_warm_up; ++i) { // warm-up + for (int i = 0; i < n_iters_warm_up; ++i) + { // warm-up auto h_ipr_gpu = co_ipr.exchange(p_ord(data_ipr_gpu)); h_ipr_gpu.wait(); } - for (int i = 0; i < n_iters; ++i) { // benchmark + for (int i = 0; i < n_iters; ++i) + { // benchmark timer_type t_local; MPI_Barrier(context.mpi_comm()); t_local.tic(); @@ -775,7 +865,8 @@ TEST(unstructured_parmetis, receive_type) { } // check - cudaMemcpy(f_ipr_cpu.data(), f_ipr_gpu, d_ord.size() * d_ord.levels() * sizeof(idx_t), cudaMemcpyDeviceToHost); + cudaMemcpy(f_ipr_cpu.data(), f_ipr_gpu, d_ord.size() * d_ord.levels() * sizeof(idx_t), + cudaMemcpyDeviceToHost); check_exchanged_data(d_ord, p_ord[0], f_ipr_cpu, d_id_offset); // deallocate @@ -783,32 +874,40 @@ TEST(unstructured_parmetis, receive_type) { // ======== output ======================================================= - idx_t n_halo_vertices_local{static_cast(d.size() - d.inner_size())}, n_halo_vertices_global; - MPI_Allreduce(&n_halo_vertices_local, &n_halo_vertices_global, 1, MPI_INT64_T, MPI_SUM, context.mpi_comm()); // MPI type set according to parmetis idx type + idx_t n_halo_vertices_local{static_cast(d.size() - d.inner_size())}, + n_halo_vertices_global; + MPI_Allreduce(&n_halo_vertices_local, &n_halo_vertices_global, 1, MPI_INT64_T, MPI_SUM, + context.mpi_comm()); // MPI type set according to parmetis idx type file << "total exchanged size in GB (assuming value type = idx_t): " - << static_cast(n_halo_vertices_global * 
levels * sizeof(idx_t) * 2) / (1024.0 * 1024.0 * 1024.0) << "\n\n"; + << static_cast(n_halo_vertices_global * levels * sizeof(idx_t) * 2) / + (1024.0 * 1024.0 * 1024.0) + << "\n\n"; file << "1 - unordered halos - buffered receive - GPU\n" - << "\tlocal time = " << t_buf_local_gpu.mean() / 1000.0 - << "+/-" << t_buf_local_gpu.stddev() / (std::sqrt(t_buf_local_gpu.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_buf_global_gpu.mean() / 1000.0 - << "+/-" << t_buf_global_gpu.stddev() / (std::sqrt(t_buf_global_gpu.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_buf_local_gpu.mean() / 1000.0 << "+/-" + << t_buf_local_gpu.stddev() / (std::sqrt(t_buf_local_gpu.num_samples()) * 1000.0) << "ms\n" + << "\tglobal time = " << t_buf_global_gpu.mean() / 1000.0 << "+/-" + << t_buf_global_gpu.stddev() / (std::sqrt(t_buf_global_gpu.num_samples()) * 1000.0) + << "ms\n"; file << "2 - ordered halos - buffered receive - GPU\n" - << "\tlocal time = " << t_ord_buf_local_gpu.mean() / 1000.0 - << "+/-" << t_ord_buf_local_gpu.stddev() / (std::sqrt(t_ord_buf_local_gpu.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_ord_buf_global_gpu.mean() / 1000.0 - << "+/-" << t_ord_buf_global_gpu.stddev() / (std::sqrt(t_ord_buf_global_gpu.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_ord_buf_local_gpu.mean() / 1000.0 << "+/-" + << t_ord_buf_local_gpu.stddev() / (std::sqrt(t_ord_buf_local_gpu.num_samples()) * 1000.0) + << "ms\n" + << "\tglobal time = " << t_ord_buf_global_gpu.mean() / 1000.0 << "+/-" + << t_ord_buf_global_gpu.stddev() / (std::sqrt(t_ord_buf_global_gpu.num_samples()) * 1000.0) + << "ms\n"; file << "3 - ordered halos - in-place receive - GPU\n" - << "\tlocal time = " << t_ord_ipr_local_gpu.mean() / 1000.0 - << "+/-" << t_ord_ipr_local_gpu.stddev() / (std::sqrt(t_ord_ipr_local_gpu.num_samples()) * 1000.0) << "ms\n" - << "\tglobal time = " << t_ord_ipr_global_gpu.mean() / 1000.0 - << "+/-" << t_ord_ipr_global_gpu.stddev() / 
(std::sqrt(t_ord_ipr_global_gpu.num_samples()) * 1000.0) << "ms\n"; + << "\tlocal time = " << t_ord_ipr_local_gpu.mean() / 1000.0 << "+/-" + << t_ord_ipr_local_gpu.stddev() / (std::sqrt(t_ord_ipr_local_gpu.num_samples()) * 1000.0) + << "ms\n" + << "\tglobal time = " << t_ord_ipr_global_gpu.mean() / 1000.0 << "+/-" + << t_ord_ipr_global_gpu.stddev() / (std::sqrt(t_ord_ipr_global_gpu.num_samples()) * 1000.0) + << "ms\n"; #endif // MPI setup MPI_Comm_free(&comm); - } diff --git a/bindings/fhex/cubed_sphere_bind.cpp b/bindings/fhex/cubed_sphere_bind.cpp index d26788fe..a604d366 100644 --- a/bindings/fhex/cubed_sphere_bind.cpp +++ b/bindings/fhex/cubed_sphere_bind.cpp @@ -19,113 +19,127 @@ #include "ghex_defs.hpp" using namespace gridtools::ghex::fhex; -using arch_type = ghex::cpu; -using domain_id_type = ghex::structured::cubed_sphere::domain_id_type; - -namespace gridtools { - namespace ghex { - namespace fhex { - - struct cubed_sphere_field_descriptor { - fp_type *data; - int offset[3]; - int extents[3]; - int halo[4]; - int n_components; - int layout; - bool is_vector; - }; - - using field_vector_type = std::vector; - struct cubed_sphere_domain_descriptor { - field_vector_type *fields = nullptr; - int tile; - int device_id; - int cube[2]; // local grid dimensions - int first[2]; // indices of the first LOCAL grid point, in global index space - int last[2]; // indices of the last LOCAL grid point, in global index space - }; - - // compare two fields to establish, if the same pattern can be used for both - struct field_compare { - bool operator()( const cubed_sphere_field_descriptor& lhs, const cubed_sphere_field_descriptor& rhs ) const - { - if(lhs.halo[0] < rhs.halo[0]) return true; - if(lhs.halo[0] > rhs.halo[0]) return false; - if(lhs.halo[1] < rhs.halo[1]) return true; - if(lhs.halo[1] > rhs.halo[1]) return false; - if(lhs.halo[2] < rhs.halo[2]) return true; - if(lhs.halo[2] > rhs.halo[2]) return false; - if(lhs.halo[3] < rhs.halo[3]) return true; - 
if(lhs.halo[3] > rhs.halo[3]) return false; - - return false; - } - }; - - using grid_type = ghex::structured::grid; - using grid_detail_type = ghex::structured::detail::grid>>; // only 3D grids - using domain_descriptor_type = ghex::structured::cubed_sphere::domain_descriptor; - using pattern_type = ghex::pattern_container; - using communication_obj_type = ghex::communication_object; - using pattern_map_type = std::map; - using exchange_handle_type = communication_obj_type::handle_type; - using halo_generator_type = ghex::structured::cubed_sphere::halo_generator; - - // row-major storage - using field_descriptor_type_1 = ghex::structured::cubed_sphere::field_descriptor>; - using pattern_field_type_1 = ghex::buffer_info; - using pattern_field_vector_type_1 = std::pair>, std::vector>; - - // field-major storage - using field_descriptor_type_2 = ghex::structured::cubed_sphere::field_descriptor>; - using pattern_field_type_2 = ghex::buffer_info; - using pattern_field_vector_type_2 = std::pair>, std::vector>; - - struct pattern_field_data { - pattern_field_vector_type_1 row_major; - pattern_field_vector_type_2 field_major; - }; - - // a map of field descriptors to patterns - static pattern_map_type field_to_pattern; - } - } -} +using arch_type = ghex::cpu; +using domain_id_type = ghex::structured::cubed_sphere::domain_id_type; + +namespace gridtools +{ +namespace ghex +{ +namespace fhex +{ -extern "C" -void ghex_cubed_sphere_co_init(obj_wrapper **wco_ref, obj_wrapper *wcomm) +struct cubed_sphere_field_descriptor +{ + fp_type* data; + int offset[3]; + int extents[3]; + int halo[4]; + int n_components; + int layout; + bool is_vector; +}; + +using field_vector_type = std::vector; +struct cubed_sphere_domain_descriptor { - if(nullptr == wcomm) return; - auto &comm = *get_object_ptr_unsafe(wcomm); + field_vector_type* fields = nullptr; + int tile; + int device_id; + int cube[2]; // local grid dimensions + int first[2]; // indices of the first LOCAL grid point, in global 
index space + int last[2]; // indices of the last LOCAL grid point, in global index space +}; + +// compare two fields to establish, if the same pattern can be used for both +struct field_compare +{ + bool operator()(const cubed_sphere_field_descriptor& lhs, + const cubed_sphere_field_descriptor& rhs) const + { + if (lhs.halo[0] < rhs.halo[0]) return true; + if (lhs.halo[0] > rhs.halo[0]) return false; + if (lhs.halo[1] < rhs.halo[1]) return true; + if (lhs.halo[1] > rhs.halo[1]) return false; + if (lhs.halo[2] < rhs.halo[2]) return true; + if (lhs.halo[2] > rhs.halo[2]) return false; + if (lhs.halo[3] < rhs.halo[3]) return true; + if (lhs.halo[3] > rhs.halo[3]) return false; + + return false; + } +}; + +using grid_type = ghex::structured::grid; +using grid_detail_type = + ghex::structured::detail::grid>>; // only 3D grids +using domain_descriptor_type = ghex::structured::cubed_sphere::domain_descriptor; +using pattern_type = ghex::pattern_container; +using communication_obj_type = + ghex::communication_object; +using pattern_map_type = std::map; +using exchange_handle_type = communication_obj_type::handle_type; +using halo_generator_type = ghex::structured::cubed_sphere::halo_generator; + +// row-major storage +using field_descriptor_type_1 = ghex::structured::cubed_sphere::field_descriptor>; +using pattern_field_type_1 = + ghex::buffer_info; +using pattern_field_vector_type_1 = std::pair>, + std::vector>; + +// field-major storage +using field_descriptor_type_2 = ghex::structured::cubed_sphere::field_descriptor>; +using pattern_field_type_2 = + ghex::buffer_info; +using pattern_field_vector_type_2 = std::pair>, + std::vector>; + +struct pattern_field_data +{ + pattern_field_vector_type_1 row_major; + pattern_field_vector_type_2 field_major; +}; + +// a map of field descriptors to patterns +static pattern_map_type field_to_pattern; +} // namespace fhex +} // namespace ghex +} // namespace gridtools + +extern "C" void +ghex_cubed_sphere_co_init(obj_wrapper** 
wco_ref, obj_wrapper* wcomm) +{ + if (nullptr == wcomm) return; + auto& comm = *get_object_ptr_unsafe(wcomm); *wco_ref = new obj_wrapper(ghex::make_communication_object(comm)); } -extern "C" -void ghex_cubed_sphere_domain_add_field(cubed_sphere_domain_descriptor *domain_desc, cubed_sphere_field_descriptor *field_desc) +extern "C" void +ghex_cubed_sphere_domain_add_field(cubed_sphere_domain_descriptor* domain_desc, + cubed_sphere_field_descriptor* field_desc) { - if(nullptr == domain_desc || nullptr == field_desc) return; - if(nullptr == domain_desc->fields){ - domain_desc->fields = new field_vector_type(); - } + if (nullptr == domain_desc || nullptr == field_desc) return; + if (nullptr == domain_desc->fields) { domain_desc->fields = new field_vector_type(); } domain_desc->fields->push_back(*field_desc); } -extern "C" -void ghex_cubed_sphere_domain_free(cubed_sphere_domain_descriptor *domain_desc) +extern "C" void +ghex_cubed_sphere_domain_free(cubed_sphere_domain_descriptor* domain_desc) { - if(nullptr == domain_desc) return; + if (nullptr == domain_desc) return; delete domain_desc->fields; domain_desc->fields = nullptr; domain_desc->tile = -1; domain_desc->device_id = -1; } -extern "C" -void* ghex_cubed_sphere_exchange_desc_new(cubed_sphere_domain_descriptor *domains_desc, int n_domains) +extern "C" void* +ghex_cubed_sphere_exchange_desc_new(cubed_sphere_domain_descriptor* domains_desc, int n_domains) { - - if(0 == n_domains || nullptr == domains_desc) return nullptr; + if (0 == n_domains || nullptr == domains_desc) return nullptr; // Create all necessary patterns: // 1. 
make a vector of local domain descriptors @@ -135,69 +149,80 @@ void* ghex_cubed_sphere_exchange_desc_new(cubed_sphere_domain_descriptor *domain // switch from fortran 1-based numbering to C std::vector local_domains; - for(int i=0; i &halo = *((std::array*)(field.halo)); - auto halo_generator = halo_generator_type(halo); - pit = field_to_pattern.emplace(std::make_pair(std::move(field), - ghex::make_pattern(*ghex_context, halo_generator, local_domains))).first; + if (pit == field_to_pattern.end()) + { + std::array& halo = *((std::array*)(field.halo)); + auto halo_generator = halo_generator_type(halo); + pit = field_to_pattern + .emplace(std::make_pair(std::move(field), + ghex::make_pattern(*ghex_context, halo_generator, + local_domains))) + .first; } - pattern_type &pattern = (*pit).second; - std::array &offset = *((std::array*)field.offset); - std::array &extents = *((std::array*)field.extents); + pattern_type& pattern = (*pit).second; + std::array& offset = *((std::array*)field.offset); + std::array& extents = *((std::array*)field.extents); // ASYMETRY - if(GhexLayoutFieldLast == field.layout){ - std::unique_ptr field_desc_uptr(new field_descriptor_type_1(local_domains[i], field.data, offset, extents, field.n_components, field.is_vector)); - auto ptr = field_desc_uptr.get(); - pattern_fields.row_major.first.push_back(std::move(field_desc_uptr)); - pattern_fields.row_major.second.push_back(pattern(*ptr)); - } else { - std::unique_ptr field_desc_uptr(new field_descriptor_type_2(local_domains[i], field.data, offset, extents, field.n_components, field.is_vector)); - auto ptr = field_desc_uptr.get(); - pattern_fields.field_major.first.push_back(std::move(field_desc_uptr)); - pattern_fields.field_major.second.push_back(pattern(*ptr)); - } + if (GhexLayoutFieldLast == field.layout) + { + std::unique_ptr field_desc_uptr( + new field_descriptor_type_1(local_domains[i], field.data, offset, extents, + field.n_components, field.is_vector)); + auto ptr = 
field_desc_uptr.get(); + pattern_fields.row_major.first.push_back(std::move(field_desc_uptr)); + pattern_fields.row_major.second.push_back(pattern(*ptr)); + } + else + { + std::unique_ptr field_desc_uptr( + new field_descriptor_type_2(local_domains[i], field.data, offset, extents, + field.n_components, field.is_vector)); + auto ptr = field_desc_uptr.get(); + pattern_fields.field_major.first.push_back(std::move(field_desc_uptr)); + pattern_fields.field_major.second.push_back(pattern(*ptr)); + } } } return new obj_wrapper(std::move(pattern_fields)); } -extern "C" -void *ghex_cubed_sphere_exchange(obj_wrapper *cowrapper, obj_wrapper *ewrapper) +extern "C" void* +ghex_cubed_sphere_exchange(obj_wrapper* cowrapper, obj_wrapper* ewrapper) { - if(nullptr == cowrapper || nullptr == ewrapper) return nullptr; - communication_obj_type &co = *get_object_ptr_unsafe(cowrapper); - pattern_field_data &pattern_fields = *get_object_ptr_unsafe(ewrapper); - return new obj_wrapper(co.exchange(pattern_fields.row_major.second.begin(), - pattern_fields.row_major.second.end(), - pattern_fields.field_major.second.begin(), - pattern_fields.field_major.second.end())); + if (nullptr == cowrapper || nullptr == ewrapper) return nullptr; + communication_obj_type& co = *get_object_ptr_unsafe(cowrapper); + pattern_field_data& pattern_fields = *get_object_ptr_unsafe(ewrapper); + return new obj_wrapper( + co.exchange(pattern_fields.row_major.second.begin(), pattern_fields.row_major.second.end(), + pattern_fields.field_major.second.begin(), pattern_fields.field_major.second.end())); } -extern "C" -void ghex_cubed_sphere_exchange_handle_wait(obj_wrapper **ehwrapper) +extern "C" void +ghex_cubed_sphere_exchange_handle_wait(obj_wrapper** ehwrapper) { - if(nullptr == *ehwrapper) return; - exchange_handle_type &hex = *get_object_ptr_unsafe(*ehwrapper); + if (nullptr == *ehwrapper) return; + exchange_handle_type& hex = *get_object_ptr_unsafe(*ehwrapper); hex.wait(); *ehwrapper = nullptr; } diff --git 
a/bindings/python/src/_pyghex/config.cpp b/bindings/python/src/_pyghex/config.cpp index 164597ba..2e725c72 100644 --- a/bindings/python/src/_pyghex/config.cpp +++ b/bindings/python/src/_pyghex/config.cpp @@ -81,9 +81,9 @@ print_config(const pybind11::dict& d) void register_config(pybind11::module& m) { - m - .def("config", &config, "Get GHEX's configuration.") - .def("print_config", [](const pybind11::dict& d) { return print_config(d); }, + m.def("config", &config, "Get GHEX's configuration.") + .def( + "print_config", [](const pybind11::dict& d) { return print_config(d); }, "Print GHEX's configuration."); } } // namespace pyghex diff --git a/bindings/python/src/_pyghex/context_shim.cpp b/bindings/python/src/_pyghex/context_shim.cpp index 2c531aeb..7c8db752 100644 --- a/bindings/python/src/_pyghex/context_shim.cpp +++ b/bindings/python/src/_pyghex/context_shim.cpp @@ -53,7 +53,8 @@ register_context(pybind11::module& m) "size", [](const context_shim& c) { return c.m.size(); }, "number of ranks within the communicator"); - m.def("expose_cpp_ptr", [](context_shim* obj){return reinterpret_cast(&obj->m);}); + m.def("expose_cpp_ptr", + [](context_shim* obj) { return reinterpret_cast(&obj->m); }); } } // namespace pyghex diff --git a/bindings/python/src/_pyghex/mpi_comm_shim.cpp b/bindings/python/src/_pyghex/mpi_comm_shim.cpp index e6447dbe..73d94c77 100644 --- a/bindings/python/src/_pyghex/mpi_comm_shim.cpp +++ b/bindings/python/src/_pyghex/mpi_comm_shim.cpp @@ -109,7 +109,6 @@ register_mpi(pybind11::module& m) m.def("mpi_finalize", &mpi_finalize, "Finalize MPI (calls MPI_Finalize)"); m.def("mpi_is_initialized", &mpi_is_initialized, "Check if MPI is initialized."); m.def("mpi_is_finalized", &mpi_is_finalized, "Check if MPI is finalized."); - } } // namespace pyghex diff --git a/bindings/python/src/_pyghex/py_dtype_to_cpp_name.cpp b/bindings/python/src/_pyghex/py_dtype_to_cpp_name.cpp index cb866a5f..18e1e6d0 100644 --- 
a/bindings/python/src/_pyghex/py_dtype_to_cpp_name.cpp +++ b/bindings/python/src/_pyghex/py_dtype_to_cpp_name.cpp @@ -22,21 +22,24 @@ namespace py = pybind11; namespace pyghex { -std::string py_dtype_to_cpp_name(py::dtype dtype) { +std::string +py_dtype_to_cpp_name(py::dtype dtype) +{ std::string cpp_name; - gridtools::for_each([&cpp_name, &dtype](auto l) { - using type = decltype(l); + gridtools::for_each( + [&cpp_name, &dtype](auto l) + { + using type = decltype(l); - if (dtype.is(py::dtype::of())) { - assert(cpp_name.empty()); - cpp_name = util::mangle_python(); - } - }); + if (dtype.is(py::dtype::of())) + { + assert(cpp_name.empty()); + cpp_name = util::mangle_python(); + } + }); - if (cpp_name.empty()) { - throw std::invalid_argument("Unsupported numpy dtype"); - } + if (cpp_name.empty()) { throw std::invalid_argument("Unsupported numpy dtype"); } return cpp_name; } diff --git a/bindings/python/src/_pyghex/register_class.hpp b/bindings/python/src/_pyghex/register_class.hpp index 78e75baf..d33f669e 100644 --- a/bindings/python/src/_pyghex/register_class.hpp +++ b/bindings/python/src/_pyghex/register_class.hpp @@ -18,12 +18,14 @@ namespace pyghex { template -auto register_class(pybind11::module& m) { - +auto +register_class(pybind11::module& m) +{ auto demangled = util::demangle(); auto pymangled = util::mangle_python(demangled); return pybind11::class_(m, pymangled.c_str()) - .def_property_readonly_static("__cpp_type__", [demangled](const pybind11::object&) { return demangled; }) + .def_property_readonly_static("__cpp_type__", + [demangled](const pybind11::object&) { return demangled; }) .def("__str__", [pymangled](const T&) { return ""; }) .def("__repr__", [pymangled](const T&) { return ""; }); } diff --git a/bindings/python/src/_pyghex/structured/regular/communication_object.cpp b/bindings/python/src/_pyghex/structured/regular/communication_object.cpp index 2adba96f..d46d2de1 100644 --- a/bindings/python/src/_pyghex/structured/regular/communication_object.cpp 
+++ b/bindings/python/src/_pyghex/structured/regular/communication_object.cpp @@ -43,8 +43,7 @@ register_communication_object(pybind11::module& m) auto _handle = register_class(m); - _handle - .def("wait", &handle_type::wait) + _handle.def("wait", &handle_type::wait) .def("is_ready", &handle_type::is_ready) .def("progress", &handle_type::progress); @@ -62,28 +61,26 @@ register_communication_object(pybind11::module& m) { return co.exchange(b.begin(), b.end()); }, pybind11::keep_alive<0, 1>()) .def( - "exchange", - [](communication_object_shim& co, buffer_info_type& b) - { return co.exchange(b); }, - pybind11::keep_alive<0, 1>()) + "exchange", [](communication_object_shim& co, buffer_info_type& b) + { return co.exchange(b); }, pybind11::keep_alive<0, 1>()) .def( "exchange", - [](communication_object_shim& co, buffer_info_type& b0, buffer_info_type& b1) - { return co.exchange(b0, b1); }, + [](communication_object_shim& co, buffer_info_type& b0, + buffer_info_type& b1) { return co.exchange(b0, b1); }, pybind11::keep_alive<0, 1>()) .def( "exchange", - [](communication_object_shim& co, buffer_info_type& b0, buffer_info_type& b1, buffer_info_type& b2) + [](communication_object_shim& co, buffer_info_type& b0, + buffer_info_type& b1, buffer_info_type& b2) { return co.exchange(b0, b1, b2); }, pybind11::keep_alive<0, 1>()); }); }); - m.def( - "make_co_regular", - [](context_shim& c){ return communication_object_shim{&c.m, std::monostate{}}; }, - pybind11::keep_alive<0, 1>()); - + m.def( + "make_co_regular", + [](context_shim& c) { return communication_object_shim{&c.m, std::monostate{}}; }, + pybind11::keep_alive<0, 1>()); } } //namespace regular diff --git a/bindings/python/src/_pyghex/structured/regular/communication_object.hpp b/bindings/python/src/_pyghex/structured/regular/communication_object.hpp index afa75521..e0854357 100644 --- a/bindings/python/src/_pyghex/structured/regular/communication_object.hpp +++ 
b/bindings/python/src/_pyghex/structured/regular/communication_object.hpp @@ -34,32 +34,34 @@ using communication_object_specializations = communication_object_args>; } // namespace - // Communication object specializations are stored in a variant and constructed on demand before the first exchange. // - this removes the need to inject the pattern type at construction, i.e. // in the python function `make_communication_object` doesn't require a pattern object to infer the type anymore // - if this communication object shim is later used with a different *type* of pattern, for example // a 2d pattern instead of a 3d pattern, the exchange will fail with an exception -struct communication_object_shim { +struct communication_object_shim +{ // the variant's first alternative is of type std::monostate to indicate the empty state - using variant_t = - gridtools::meta::rename>; + using variant_t = gridtools::meta::rename>; ghex::context* ctx = nullptr; - variant_t m; + variant_t m; // exchange of buffer info objects template - auto exchange(ghex::buffer_info&... b) { + auto exchange(ghex::buffer_info&... b) + { return get_co>().exchange(b...); } // exchange of iterator pairs pointing to buffer info ranges template - auto exchange(Its... its) { + auto exchange(Its... 
its) + { // need even number of iterators (begin and end) static_assert(sizeof...(Its) % 2 == 0); - return exchange_from_iterators(std::make_tuple(std::move(its)...), std::make_index_sequence()); + return exchange_from_iterators(std::make_tuple(std::move(its)...), + std::make_index_sequence()); } private: @@ -73,9 +75,10 @@ struct communication_object_shim { // helper function for iterators template - auto exchange_from_iterators(std::tuple t, std::index_sequence) { + auto exchange_from_iterators(std::tuple t, std::index_sequence) + { // every second iterator is a begin - using begins = decltype(std::make_tuple(std::get(t)...)); + using begins = decltype(std::make_tuple(std::get(t)...)); static constexpr std::size_t half_size = sizeof...(Is); return get_co>().exchange( std::get(t)..., std::get(t)...); @@ -85,7 +88,8 @@ struct communication_object_shim { // - will initialize the communication object if the variant is empty // - will throw if a different communication object specialization was initialized earlier template - auto& get_co() { + auto& get_co() + { // extract and deduplicate grids from patterns using grids = gridtools::meta::dedup>; // check that all grids are of same type @@ -97,11 +101,12 @@ struct communication_object_shim { static_assert(gridtools::meta::length::value == 1); // communication object type - using co_t = ghex::communication_object, gridtools::meta::at_c>; + using co_t = ghex::communication_object, + gridtools::meta::at_c>; // check whether co_t is in variant static_assert(gridtools::meta::find::value < - gridtools::meta::length::value); + gridtools::meta::length::value); // initialize variant with communication object if necessary if (m.index() == 0) m.emplace(*ctx); diff --git a/bindings/python/src/_pyghex/structured/regular/field_descriptor.cpp b/bindings/python/src/_pyghex/structured/regular/field_descriptor.cpp index 68bc1bfb..e38664b0 100644 --- a/bindings/python/src/_pyghex/structured/regular/field_descriptor.cpp +++ 
b/bindings/python/src/_pyghex/structured/regular/field_descriptor.cpp @@ -93,13 +93,12 @@ struct buffer_info_accessor assert(pybind11::ssize_t(strides.size()) == ndim); } - return pybind11::buffer_info( - ptr, /* Pointer to buffer */ - itemsize, /* Size of one scalar */ - format, /* Python struct-style format descriptor */ - ndim, /* Number of dimensions */ - shape, /* Buffer dimensions */ - strides /* Strides (in bytes) for each index */ + return pybind11::buffer_info(ptr, /* Pointer to buffer */ + itemsize, /* Size of one scalar */ + format, /* Python struct-style format descriptor */ + ndim, /* Number of dimensions */ + shape, /* Buffer dimensions */ + strides /* Strides (in bytes) for each index */ ); } }; @@ -132,41 +131,47 @@ register_field_descriptor(pybind11::module& m) using array = std::array; using grid_type = ghex::structured::grid::template type; using pattern_type = ghex::pattern; - using buffer_info_type = ghex::buffer_info; + using buffer_info_type = + ghex::buffer_info; auto _field_descriptor = register_class(m); - /*auto _buffer_info =*/ register_class(m); - - _field_descriptor - .def(pybind11::init( - [](const domain_descriptor_type& dom, pybind11::object& b, - const array& offsets, const array& extents) - { - pybind11::buffer_info info = get_buffer_info(b); - - if (!info.item_type_is_equivalent_to()) - { - std::stringstream error; - error << "Incompatible format: expected a " << typeid(T).name() - << " buffer."; - throw pybind11::type_error(error.str()); - } - - auto ordered_strides = info.strides; - std::sort(ordered_strides.begin(), ordered_strides.end(), [](int a, int b) { return a > b; }); - array b_layout_map; - for (size_t i = 0; i < dimension::value; ++i) { - auto it = std::find(ordered_strides.begin(), ordered_strides.end(), info.strides[i]); - b_layout_map[i] = std::distance(ordered_strides.begin(), it); - if (b_layout_map[i] != layout_map::at(i)) { - throw pybind11::type_error("Buffer has a different layout than specified."); - } - } - 
- return ghex::wrap_field(dom, - static_cast(info.ptr), offsets, extents, info.strides); - }), - pybind11::keep_alive<0, 2>()); + /*auto _buffer_info =*/register_class(m); + + _field_descriptor.def( + pybind11::init( + [](const domain_descriptor_type& dom, pybind11::object& b, const array& offsets, + const array& extents) + { + pybind11::buffer_info info = get_buffer_info(b); + + if (!info.item_type_is_equivalent_to()) + { + std::stringstream error; + error << "Incompatible format: expected a " << typeid(T).name() + << " buffer."; + throw pybind11::type_error(error.str()); + } + + auto ordered_strides = info.strides; + std::sort(ordered_strides.begin(), ordered_strides.end(), + [](int a, int b) { return a > b; }); + array b_layout_map; + for (size_t i = 0; i < dimension::value; ++i) + { + auto it = std::find(ordered_strides.begin(), ordered_strides.end(), + info.strides[i]); + b_layout_map[i] = std::distance(ordered_strides.begin(), it); + if (b_layout_map[i] != layout_map::at(i)) + { + throw pybind11::type_error( + "Buffer has a different layout than specified."); + } + } + + return ghex::wrap_field(dom, + static_cast(info.ptr), offsets, extents, info.strides); + }), + pybind11::keep_alive<0, 2>()); }); } diff --git a/bindings/python/src/_pyghex/structured/regular/halo_generator.cpp b/bindings/python/src/_pyghex/structured/regular/halo_generator.cpp index 60f412e0..a54e6851 100644 --- a/bindings/python/src/_pyghex/structured/regular/halo_generator.cpp +++ b/bindings/python/src/_pyghex/structured/regular/halo_generator.cpp @@ -38,7 +38,6 @@ register_halo_generator(pybind11::module& m) using box = typename type::box; using box2 = typename type::box2; - auto _halo_generator = register_class(m); auto _box = register_class(m); auto _box2 = register_class(m); @@ -54,8 +53,7 @@ register_halo_generator(pybind11::module& m) .def_property_readonly("global_", pybind11::overload_cast<>(&box2::global, pybind11::const_)); - _box - .def_property_readonly("first", + 
_box.def_property_readonly("first", [](const box& b) { auto first = b.first(); diff --git a/bindings/python/src/_pyghex/structured/regular/pattern.cpp b/bindings/python/src/_pyghex/structured/regular/pattern.cpp index 845b1c79..629b870b 100644 --- a/bindings/python/src/_pyghex/structured/regular/pattern.cpp +++ b/bindings/python/src/_pyghex/structured/regular/pattern.cpp @@ -65,10 +65,8 @@ register_pattern(pybind11::module& m) // `&pattern_container::template operator()` leads to an // "identifier undefined in device code" error when using NVCC _pattern_container.def( - "__call__", - [](const pattern_container& pattern, field& f) - { return pattern(f); }, - pybind11::keep_alive<0, 2>()); + "__call__", [](const pattern_container& pattern, field& f) + { return pattern(f); }, pybind11::keep_alive<0, 2>()); }); }); } diff --git a/bindings/python/src/_pyghex/unstructured/communication_object.cpp b/bindings/python/src/_pyghex/unstructured/communication_object.cpp index 1130c510..085514fc 100644 --- a/bindings/python/src/_pyghex/unstructured/communication_object.cpp +++ b/bindings/python/src/_pyghex/unstructured/communication_object.cpp @@ -41,8 +41,7 @@ register_communication_object(pybind11::module& m) auto _communication_object = register_class(m); auto _handle = register_class(m); - _handle - .def("wait", &handle::wait) + _handle.def("wait", &handle::wait) .def("is_ready", &handle::is_ready) .def("progress", &handle::progress); @@ -55,18 +54,15 @@ register_communication_object(pybind11::module& m) _communication_object .def( - "exchange", - [](type& co, std::vector b) + "exchange", [](type& co, std::vector b) { return co.exchange(b.begin(), b.end()); }, pybind11::keep_alive<0, 1>()) .def( - "exchange", [](type& co, buffer_info_type& b) { return co.exchange(b); }, - pybind11::keep_alive<0, 1>()) + "exchange", [](type& co, buffer_info_type& b) + { return co.exchange(b); }, pybind11::keep_alive<0, 1>()) .def( - "exchange", - [](type& co, buffer_info_type& b0, 
buffer_info_type& b1) - { return co.exchange(b0, b1); }, - pybind11::keep_alive<0, 1>()) + "exchange", [](type& co, buffer_info_type& b0, buffer_info_type& b1) + { return co.exchange(b0, b1); }, pybind11::keep_alive<0, 1>()) .def( "exchange", [](type& co, buffer_info_type& b0, buffer_info_type& b1, @@ -74,17 +70,14 @@ register_communication_object(pybind11::module& m) pybind11::keep_alive<0, 1>()); }); - m.def("make_co_unstructured", - [](context_shim& c) - { - return type{c.m}; - }, + m.def( + "make_co_unstructured", [](context_shim& c) { return type{c.m}; }, pybind11::keep_alive<0, 1>()); - m.def("expose_cpp_ptr", [](type* obj){return reinterpret_cast(obj);}); + m.def("expose_cpp_ptr", + [](type* obj) { return reinterpret_cast(obj); }); }); } } // namespace unstructured } // namespace pyghex - diff --git a/bindings/python/src/_pyghex/unstructured/communication_object.hpp b/bindings/python/src/_pyghex/unstructured/communication_object.hpp index 348cc2ec..8bda5192 100644 --- a/bindings/python/src/_pyghex/unstructured/communication_object.hpp +++ b/bindings/python/src/_pyghex/unstructured/communication_object.hpp @@ -26,4 +26,3 @@ using communication_object_specializations = } // namespace } // namespace unstructured } // namespace pyghex - diff --git a/bindings/python/src/_pyghex/unstructured/domain_descriptor.cpp b/bindings/python/src/_pyghex/unstructured/domain_descriptor.cpp index c9be08de..2f6af561 100644 --- a/bindings/python/src/_pyghex/unstructured/domain_descriptor.cpp +++ b/bindings/python/src/_pyghex/unstructured/domain_descriptor.cpp @@ -47,11 +47,11 @@ register_domain_descriptor(pybind11::module& m) .def("size", &type::size, "Returns the size") .def("inner_size", &type::inner_size, "Returns the inner size") .def( - "indices", - [](const type& d) -> std::vector { return d.gids(); }, - "Returns the indices"); + "indices", [](const type& d) -> std::vector + { return d.gids(); }, "Returns the indices"); - m.def("expose_cpp_ptr", [](type* obj){return 
reinterpret_cast(obj);}); + m.def("expose_cpp_ptr", + [](type* obj) { return reinterpret_cast(obj); }); }); } diff --git a/bindings/python/src/_pyghex/unstructured/field_descriptor.cpp b/bindings/python/src/_pyghex/unstructured/field_descriptor.cpp index b2daf7b8..75f4e3e4 100644 --- a/bindings/python/src/_pyghex/unstructured/field_descriptor.cpp +++ b/bindings/python/src/_pyghex/unstructured/field_descriptor.cpp @@ -84,13 +84,12 @@ struct buffer_info_accessor assert(pybind11::ssize_t(strides.size()) == ndim); } - return pybind11::buffer_info( - ptr, /* Pointer to buffer */ - itemsize, /* Size of one scalar */ - format, /* Python struct-style format descriptor */ - ndim, /* Number of dimensions */ - shape, /* Buffer dimensions */ - strides /* Strides (in bytes) for each index */ + return pybind11::buffer_info(ptr, /* Pointer to buffer */ + itemsize, /* Size of one scalar */ + format, /* Python struct-style format descriptor */ + ndim, /* Number of dimensions */ + shape, /* Buffer dimensions */ + strides /* Strides (in bytes) for each index */ ); } }; @@ -123,10 +122,10 @@ register_field_descriptor(pybind11::module& m) using buffer_info_type = ghex::buffer_info; auto _field_descriptor = register_class(m); - /*auto _buffer_info = */register_class(m); + /*auto _buffer_info = */ register_class(m); - _field_descriptor - .def(pybind11::init( + _field_descriptor.def( + pybind11::init( [](const domain_descriptor_type& dom, pybind11::object& b) { pybind11::buffer_info info = get_buffer_info(b); @@ -150,35 +149,40 @@ register_field_descriptor(pybind11::module& m) "field's first dimension must match the size of the domain"); } - bool levels_first = true; + bool levels_first = true; std::size_t outer_strides = 0u; if (info.ndim == 2 && info.strides[1] != sizeof(T)) { levels_first = false; if (info.strides[0] != sizeof(T)) - throw pybind11::type_error("field's strides are not compatible with GHEX"); + throw pybind11::type_error( + "field's strides are not compatible with 
GHEX"); outer_strides = info.strides[1] / sizeof(T); - if (outer_strides*sizeof(T) != (std::size_t)(info.strides[1])) - throw pybind11::type_error("field's strides are not compatible with GHEX"); + if (outer_strides * sizeof(T) != (std::size_t)(info.strides[1])) + throw pybind11::type_error( + "field's strides are not compatible with GHEX"); } else if (info.ndim == 2) { if (info.strides[1] != sizeof(T)) - throw pybind11::type_error("field's strides are not compatible with GHEX"); + throw pybind11::type_error( + "field's strides are not compatible with GHEX"); outer_strides = info.strides[0] / sizeof(T); - if (outer_strides*sizeof(T) != (std::size_t)(info.strides[0])) - throw pybind11::type_error("field's strides are not compatible with GHEX"); + if (outer_strides * sizeof(T) != (std::size_t)(info.strides[0])) + throw pybind11::type_error( + "field's strides are not compatible with GHEX"); } else { if (info.strides[0] != sizeof(T)) - throw pybind11::type_error("field's strides are not compatible with GHEX"); + throw pybind11::type_error( + "field's strides are not compatible with GHEX"); } - std::size_t levels = - (info.ndim == 1) ? 1u : (std::size_t)info.shape[1]; + std::size_t levels = (info.ndim == 1) ? 
1u : (std::size_t)info.shape[1]; - return type{dom, static_cast(info.ptr), levels, levels_first, outer_strides}; - }), + return type{dom, static_cast(info.ptr), levels, levels_first, + outer_strides}; + }), pybind11::keep_alive<0, 2>()); }); } diff --git a/bindings/python/src/_pyghex/unstructured/field_descriptor.hpp b/bindings/python/src/_pyghex/unstructured/field_descriptor.hpp index 09f0bad8..f59ba7d0 100644 --- a/bindings/python/src/_pyghex/unstructured/field_descriptor.hpp +++ b/bindings/python/src/_pyghex/unstructured/field_descriptor.hpp @@ -19,8 +19,8 @@ namespace unstructured { namespace { -using field_descriptor_args = gridtools::meta::cartesian_product; +using field_descriptor_args = gridtools::meta::cartesian_product; using field_descriptor_specializations = gridtools::meta::transform< gridtools::meta::rename::template apply, @@ -28,4 +28,3 @@ using field_descriptor_specializations = gridtools::meta::transform< } // namespace } // namespace unstructured } // namespace pyghex - diff --git a/bindings/python/src/_pyghex/unstructured/halo_generator.cpp b/bindings/python/src/_pyghex/unstructured/halo_generator.cpp index 0d65d994..f505c958 100644 --- a/bindings/python/src/_pyghex/unstructured/halo_generator.cpp +++ b/bindings/python/src/_pyghex/unstructured/halo_generator.cpp @@ -31,11 +31,11 @@ register_halo_generator(pybind11::module& m) using halo = typename type::halo; auto _halo_generator = register_class(m); - /*auto _halo = */register_class(m); + /*auto _halo = */ register_class(m); - _halo_generator - .def(pybind11::init<>(), "Create a halo generator") - .def(pybind11::init([](const std::vector& gids){ return type{gids};})) + _halo_generator.def(pybind11::init<>(), "Create a halo generator") + .def(pybind11::init( + [](const std::vector& gids) { return type{gids}; })) .def("__call__", &type::operator()); }); } diff --git a/bindings/python/src/_pyghex/unstructured/halo_generator.hpp b/bindings/python/src/_pyghex/unstructured/halo_generator.hpp index 
73838e3d..b8fb5b1b 100644 --- a/bindings/python/src/_pyghex/unstructured/halo_generator.hpp +++ b/bindings/python/src/_pyghex/unstructured/halo_generator.hpp @@ -27,4 +27,3 @@ using halo_generator_specializations = gridtools::meta::transform< } // namespace } // namespace unstructured } // namespace pyghex - diff --git a/bindings/python/src/_pyghex/unstructured/pattern.cpp b/bindings/python/src/_pyghex/unstructured/pattern.cpp index 3f505aeb..01f1ae12 100644 --- a/bindings/python/src/_pyghex/unstructured/pattern.cpp +++ b/bindings/python/src/_pyghex/unstructured/pattern.cpp @@ -52,10 +52,8 @@ register_pattern(pybind11::module& m) { return util::mangle_python(); }); m.def( - "make_pattern_unstructured", - [](context_shim& c, halo_gen& h, domain_range& d) - { return ghex::make_pattern(c.m, h, d); }, - pybind11::keep_alive<0, 1>()); + "make_pattern_unstructured", [](context_shim& c, halo_gen& h, domain_range& d) + { return ghex::make_pattern(c.m, h, d); }, pybind11::keep_alive<0, 1>()); gridtools::for_each>( [&m, &_pattern_container](auto k) @@ -65,13 +63,12 @@ register_pattern(pybind11::module& m) // `&pattern_container::template operator()` leads to an // "identifier undefined in device code" error when using NVCC _pattern_container.def( - "__call__", - [](const pattern_container& pattern, field& f) - { return pattern(f); }, - pybind11::keep_alive<0, 2>()); + "__call__", [](const pattern_container& pattern, field& f) + { return pattern(f); }, pybind11::keep_alive<0, 2>()); }); - m.def("expose_cpp_ptr", [](pattern_container* obj){return reinterpret_cast(obj);}); + m.def("expose_cpp_ptr", + [](pattern_container* obj) { return reinterpret_cast(obj); }); }); } diff --git a/bindings/python/src/_pyghex/unstructured/pattern.hpp b/bindings/python/src/_pyghex/unstructured/pattern.hpp index b9f8766a..5bd68db8 100644 --- a/bindings/python/src/_pyghex/unstructured/pattern.hpp +++ b/bindings/python/src/_pyghex/unstructured/pattern.hpp @@ -37,5 +37,3 @@ using 
make_pattern_traits_specializations = } // namespace } // namespace unstructured } // namespace pyghex - - diff --git a/bindings/python/src/_pyghex/unstructured/types.hpp b/bindings/python/src/_pyghex/unstructured/types.hpp index 4ec91734..05d1aab9 100644 --- a/bindings/python/src/_pyghex/unstructured/types.hpp +++ b/bindings/python/src/_pyghex/unstructured/types.hpp @@ -21,7 +21,7 @@ namespace unstructured struct types : public ::pyghex::types { using global_ids = gridtools::meta::list; - using grids = gridtools::meta::list >; + using grids = gridtools::meta::list>; }; } // namespace unstructured diff --git a/bindings/python/src/_pyghex/util/demangle.hpp b/bindings/python/src/_pyghex/util/demangle.hpp index dab37b2e..0eae196b 100644 --- a/bindings/python/src/_pyghex/util/demangle.hpp +++ b/bindings/python/src/_pyghex/util/demangle.hpp @@ -34,16 +34,21 @@ demangle() } inline std::string -mangle_python(std::string s) { - s.erase(std::remove_if(s.begin(), s.end(), [](unsigned char c) { return std::isspace(c); }), s.end()); +mangle_python(std::string s) +{ + s.erase(std::remove_if(s.begin(), s.end(), [](unsigned char c) { return std::isspace(c); }), + s.end()); std::string _ghex = "ghex::"; - auto pos = s.find(_ghex); - while(pos != std::string::npos) { + auto pos = s.find(_ghex); + while (pos != std::string::npos) + { s.erase(pos, _ghex.length()); pos = s.find(_ghex); } - for (auto& c : s) { - switch(c) { + for (auto& c : s) + { + switch (c) + { case ':': case ',': case '<': diff --git a/include/ghex/bulk_communication_object.hpp b/include/ghex/bulk_communication_object.hpp index 9a704b4f..143111f2 100644 --- a/include/ghex/bulk_communication_object.hpp +++ b/include/ghex/bulk_communication_object.hpp @@ -330,8 +330,8 @@ class bulk_communication_object { // initialize the remote handle - this will effectively publish the rma pointers // will do nothing if already initialized - m_local_handle.init( - f.data(), f.bytes(), std::is_same::value); + 
m_local_handle.init(f.data(), f.bytes(), + std::is_same::value); // prepare local and remote patterns // ================================= @@ -500,14 +500,15 @@ class bulk_communication_object for (auto it = h_it->second.rbegin(); it != h_it->second.rend(); ++it) { const auto& c = *it; - s_range.m_ranges.back().emplace_back( - m_co->communicator(), f, c, h_it->first.mpi_rank + s_range.m_ranges.back().emplace_back(m_co->communicator(), f, c, + h_it->first.mpi_rank #ifdef GHEX_BULK_UNIQUE_TAGS , (m_it->second + h_it->first.tag + 1) * 10000 + q #else // alternatively rely on message ordering: - , h_it->first.tag + , + h_it->first.tag #endif ); ++q; @@ -554,11 +555,13 @@ class bulk_communication_object auto bis_tp = std::make_tuple(bis...); for (std::size_t i = 0; i < sizeof...(F); ++i) { - boost::mp11::mp_with_index(i, [this, &bis_tp](auto i) { - // get the field Index - using I = decltype(i); - add_field(std::get(bis_tp)); - }); + boost::mp11::mp_with_index(i, + [this, &bis_tp](auto i) + { + // get the field Index + using I = decltype(i); + add_field(std::get(bis_tp)); + }); } } @@ -574,79 +577,89 @@ class bulk_communication_object // loop over Fields for (std::size_t i = 0; i < boost::mp11::mp_size::value; ++i) { - boost::mp11::mp_with_index::value>(i, [this](auto i) { - // get the field Index - using I = decltype(i); - // get source and target ranges - auto& bi_cont = std::get(m_buffer_info_container_tuple); - auto& f_cont = std::get(m_field_container_tuple); - // add remote exchange - for (auto& f : f_cont) bi_cont.push_back(f.m_remote_pattern(f.m_field)); - }); + boost::mp11::mp_with_index::value>(i, + [this](auto i) + { + // get the field Index + using I = decltype(i); + // get source and target ranges + auto& bi_cont = std::get(m_buffer_info_container_tuple); + auto& f_cont = std::get(m_field_container_tuple); + // add remote exchange + for (auto& f : f_cont) bi_cont.push_back(f.m_remote_pattern(f.m_field)); + }); } for (std::size_t i = 0; i < 
boost::mp11::mp_size::value; ++i) { - boost::mp11::mp_with_index::value>(i, [this](auto i) { - // get the field Index - using I = decltype(i); - // get source and target ranges - auto& s_range = std::get(m_source_ranges_tuple); - // complete the handshake - for (auto& s_vec : s_range.m_ranges) - for (auto& r : s_vec) r.recv(); - }); + boost::mp11::mp_with_index::value>(i, + [this](auto i) + { + // get the field Index + using I = decltype(i); + // get source and target ranges + auto& s_range = std::get(m_source_ranges_tuple); + // complete the handshake + for (auto& s_vec : s_range.m_ranges) + for (auto& r : s_vec) r.recv(); + }); } for (std::size_t i = 0; i < boost::mp11::mp_size::value; ++i) { - boost::mp11::mp_with_index::value>(i, [this](auto i) { - // get the field Index - using I = decltype(i); - // get source and target ranges - auto& t_range = std::get(m_target_ranges_tuple); - // complete the handshake - for (auto& t_vec : t_range.m_ranges) - for (auto& r : t_vec) r.send(); - }); + boost::mp11::mp_with_index::value>(i, + [this](auto i) + { + // get the field Index + using I = decltype(i); + // get source and target ranges + auto& t_range = std::get(m_target_ranges_tuple); + // complete the handshake + for (auto& t_vec : t_range.m_ranges) + for (auto& r : t_vec) r.send(); + }); } // loop over Fields for (std::size_t i = 0; i < boost::mp11::mp_size::value; ++i) { - boost::mp11::mp_with_index::value>(i, [this](auto i) { - // get the field Index - using I = decltype(i); - - // get target ranges for fields - auto& t_range = std::get(m_target_ranges_tuple); - for (auto& t_vec : t_range.m_ranges) - for (auto& r : t_vec) - { - // register open functions - m_open_funcs.push_back([&r]() { r.end_target_epoch(); }); - // register wait functions - m_wait_funcs.push_back(func_request{std::function( - [&r]() -> bool { return r.try_start_target_epoch(); })}); - } - - // get source ranges for fields - auto& s_range = std::get(m_source_ranges_tuple); - // put data - for 
(auto& s_vec : s_range.m_ranges) - for (auto& r : s_vec) - { - // register put functions - m_put_funcs.push_back(func_request{std::function([&r]() -> bool { - if (r.try_start_source_epoch()) - { - r.put(); - r.end_source_epoch(); - return true; - } - else - return false; - })}); - } - }); + boost::mp11::mp_with_index::value>(i, + [this](auto i) + { + // get the field Index + using I = decltype(i); + + // get target ranges for fields + auto& t_range = std::get(m_target_ranges_tuple); + for (auto& t_vec : t_range.m_ranges) + for (auto& r : t_vec) + { + // register open functions + m_open_funcs.push_back([&r]() { r.end_target_epoch(); }); + // register wait functions + m_wait_funcs.push_back(func_request{std::function( + [&r]() -> bool { return r.try_start_target_epoch(); })}); + } + + // get source ranges for fields + auto& s_range = std::get(m_source_ranges_tuple); + // put data + for (auto& s_vec : s_range.m_ranges) + for (auto& r : s_vec) + { + // register put functions + m_put_funcs.push_back(func_request{std::function( + [&r]() -> bool + { + if (r.try_start_source_epoch()) + { + r.put(); + r.end_source_epoch(); + return true; + } + else + return false; + })}); + } + }); } m_initialized = true; } diff --git a/include/ghex/communication_object.hpp b/include/ghex/communication_object.hpp index d49cd1a4..d65e6a99 100644 --- a/include/ghex/communication_object.hpp +++ b/include/ghex/communication_object.hpp @@ -245,8 +245,8 @@ class communication_object * @param last points to the end of the range * @return handle to await communication */ template - [[nodiscard]] disable_if_buffer_info exchange( - Iterator first, Iterator last) + [[nodiscard]] disable_if_buffer_info exchange(Iterator first, + Iterator last) { // call special function for a single range return exchange_u(first, last); @@ -263,11 +263,11 @@ class communication_object * @param iters first and last iterators for further ranges * @return handle to await communication */ template - [[nodiscard]] 
disable_if_buffer_info exchange( - Iterator0 first0, Iterator0 last0, Iterator1 first1, Iterator1 last1, Iterators... iters) + [[nodiscard]] disable_if_buffer_info exchange(Iterator0 first0, + Iterator0 last0, Iterator1 first1, Iterator1 last1, Iterators... iters) { - static_assert( - sizeof...(Iterators) % 2 == 0, "need even number of iteratiors: (begin,end) pairs"); + static_assert(sizeof...(Iterators) % 2 == 0, + "need even number of iteratiors: (begin,end) pairs"); // call helper function to turn iterators into pairs of iterators return exchange_make_pairs(std::make_index_sequence<2 + sizeof...(iters) / 2>(), first0, last0, first1, last1, iters...); @@ -337,7 +337,8 @@ class communication_object auto ptr = &p1.second; m_recv_reqs.push_back( m_comm.recv(p1.second.buffer, p1.second.rank, p1.second.tag, - [ptr](context::message_type& m, context::rank_type, context::tag_type) { + [ptr](context::message_type& m, context::rank_type, context::tag_type) + { device::guard g(m); packer::unpack(*ptr, g.data()); })); @@ -364,28 +365,33 @@ class communication_object using test_t = pattern_container; std::map pat_ptr_map; int max_tag = 0; - for_each(iter_pairs_t, [&pat_ptr_map, &max_tag](std::size_t, auto iter_pair) { - for (auto it = iter_pair.first; it != iter_pair.second; ++it) + for_each(iter_pairs_t, + [&pat_ptr_map, &max_tag](std::size_t, auto iter_pair) { - auto ptr = &(it->get_pattern_container()); - auto p_it_bool = pat_ptr_map.insert(std::make_pair(ptr, max_tag)); - if (p_it_bool.second == true) max_tag += ptr->max_tag() + 1; - } - }); - for_each(iter_pairs_t, [this, &pat_ptr_map](std::size_t, auto iter_pair) { - using buffer_info_t = typename std::remove_reference::type; - using arch_t = typename buffer_info_t::arch_type; - using value_t = typename buffer_info_t::value_type; - auto mem = &(std::get>(m_mem)); - for (auto it = iter_pair.first; it != iter_pair.second; ++it) + for (auto it = iter_pair.first; it != iter_pair.second; ++it) + { + auto ptr = 
&(it->get_pattern_container()); + auto p_it_bool = pat_ptr_map.insert(std::make_pair(ptr, max_tag)); + if (p_it_bool.second == true) max_tag += ptr->max_tag() + 1; + } + }); + for_each(iter_pairs_t, + [this, &pat_ptr_map](std::size_t, auto iter_pair) { - auto field_ptr = &(it->get_field()); - auto tag_offset = pat_ptr_map[&(it->get_pattern_container())]; - const auto my_dom_id = it->get_field().domain_id(); - allocate( - mem, it->get_pattern(), field_ptr, my_dom_id, it->device_id(), tag_offset); - } - }); + using buffer_info_t = + typename std::remove_reference::type; + using arch_t = typename buffer_info_t::arch_type; + using value_t = typename buffer_info_t::value_type; + auto mem = &(std::get>(m_mem)); + for (auto it = iter_pair.first; it != iter_pair.second; ++it) + { + auto field_ptr = &(it->get_field()); + auto tag_offset = pat_ptr_map[&(it->get_pattern_container())]; + const auto my_dom_id = it->get_field().domain_id(); + allocate(mem, it->get_pattern(), field_ptr, my_dom_id, + it->device_id(), tag_offset); + } + }); } // helper function to set up communicaton buffers (compile-time case) @@ -420,54 +426,62 @@ class communication_object buffer_infos_ptr_t buffer_info_tuple{&buffer_infos...}; memory_t memory_tuple{&(std::get>(m_mem))...}; // loop over buffer_infos/memory and compute required space - for_each(memory_tuple, buffer_info_tuple, [this, &tag_offsets](std::size_t i, auto mem, auto bi) { - using arch_type = typename std::remove_reference_t::arch_type; - using value_type = typename std::remove_reference_t::value_type; - auto field_ptr = &(bi->get_field()); - const domain_id_type my_dom_id = bi->get_field().domain_id(); - allocate( - mem, bi->get_pattern(), field_ptr, my_dom_id, bi->device_id(), tag_offsets[i]); - }); + for_each(memory_tuple, buffer_info_tuple, + [this, &tag_offsets](std::size_t i, auto mem, auto bi) + { + using arch_type = typename std::remove_reference_t::arch_type; + using value_type = typename std::remove_reference_t::value_type; + 
auto field_ptr = &(bi->get_field()); + const domain_id_type my_dom_id = bi->get_field().domain_id(); + allocate(mem, bi->get_pattern(), field_ptr, my_dom_id, + bi->device_id(), tag_offsets[i]); + }); } void post_recvs() { - for_each(m_mem, [this](std::size_t, auto& m) { - using arch_type = typename std::remove_reference_t::arch_type; - for (auto& p0 : m.recv_memory) + for_each(m_mem, + [this](std::size_t, auto& m) { - const auto device_id = p0.first; - for (auto& p1 : p0.second) + using arch_type = typename std::remove_reference_t::arch_type; + for (auto& p0 : m.recv_memory) { - if (p1.second.size > 0u) + const auto device_id = p0.first; + for (auto& p1 : p0.second) { - if (!p1.second.buffer || p1.second.buffer.size() != p1.second.size + if (p1.second.size > 0u) + { + if (!p1.second.buffer || p1.second.buffer.size() != p1.second.size #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) - || p1.second.buffer.device_id() != device_id + || p1.second.buffer.device_id() != device_id #endif - ) - p1.second.buffer = arch_traits::make_message( - m_comm, p1.second.size, device_id); - auto ptr = &p1.second; - // use callbacks for unpacking - m_recv_reqs.push_back(m_comm.recv(p1.second.buffer, p1.second.rank, - p1.second.tag, - [ptr](context::message_type& m, context::rank_type, context::tag_type) { - device::guard g(m); - packer::unpack(*ptr, g.data()); - })); + ) + p1.second.buffer = arch_traits::make_message(m_comm, + p1.second.size, device_id); + auto ptr = &p1.second; + // use callbacks for unpacking + m_recv_reqs.push_back( + m_comm.recv(p1.second.buffer, p1.second.rank, p1.second.tag, + [ptr](context::message_type& m, context::rank_type, + context::tag_type) + { + device::guard g(m); + packer::unpack(*ptr, g.data()); + })); + } } } - } - }); + }); } void pack() { - for_each(m_mem, [this](std::size_t, auto& m) { - using arch_type = typename std::remove_reference_t::arch_type; - packer::pack(m, m_send_reqs, m_comm); - }); + for_each(m_mem, + [this](std::size_t, 
auto& m) + { + using arch_type = typename std::remove_reference_t::arch_type; + packer::pack(m, m_send_reqs, m_comm); + }); } private: // wait functions @@ -519,12 +533,9 @@ class communication_object auto& m = std::get(m_mem); for (auto& p0 : m.recv_memory) { - for (auto& p1: p0.second) + for (auto& p1 : p0.second) { - if (p1.second.size > 0u) - { - p1.second.m_stream.sync(); - } + if (p1.second.size > 0u) { p1.second.m_stream.sync(); } } } } @@ -538,20 +549,22 @@ class communication_object m_valid = false; m_send_reqs.clear(); m_recv_reqs.clear(); - for_each(m_mem, [this](std::size_t, auto& m) { - for (auto& p0 : m.send_memory) - for (auto& p1 : p0.second) - { - p1.second.size = 0; - p1.second.field_infos.resize(0); - } - for (auto& p0 : m.recv_memory) - for (auto& p1 : p0.second) - { - p1.second.size = 0; - p1.second.field_infos.resize(0); - } - }); + for_each(m_mem, + [this](std::size_t, auto& m) + { + for (auto& p0 : m.send_memory) + for (auto& p1 : p0.second) + { + p1.second.size = 0; + p1.second.field_infos.resize(0); + } + for (auto& p0 : m.recv_memory) + for (auto& p1 : p0.second) + { + p1.second.size = 0; + p1.second.field_infos.resize(0); + } + }); } // private: // allocation member functions @@ -561,16 +574,14 @@ class communication_object { allocate::recv_buffer_type>( mem->recv_memory[device_id], pattern.recv_halos(), - [field_ptr](const void* buffer, const index_container_type& c, void* arg) { - field_ptr->unpack(reinterpret_cast(buffer), c, arg); - }, - dom_id, tag_offset, true, field_ptr); + [field_ptr](const void* buffer, const index_container_type& c, void* arg) + { field_ptr->unpack(reinterpret_cast(buffer), c, arg); }, dom_id, tag_offset, + true, field_ptr); allocate::send_buffer_type>( mem->send_memory[device_id], pattern.send_halos(), - [field_ptr](void* buffer, const index_container_type& c, void* arg) { - field_ptr->pack(reinterpret_cast(buffer), c, arg); - }, - dom_id, tag_offset, false, field_ptr); + [field_ptr](void* buffer, const 
index_container_type& c, void* arg) + { field_ptr->pack(reinterpret_cast(buffer), c, arg); }, dom_id, tag_offset, false, + field_ptr); } // compute memory requirements to be allocated on the device @@ -602,9 +613,9 @@ class communication_object if (it == memory.end()) { it = memory - .insert(std::make_pair( - d_p, BufferType{remote_rank, p_id_c.first.tag + tag_offset, {}, 0, - std::vector(), {}})) + .insert(std::make_pair(d_p, + BufferType{remote_rank, p_id_c.first.tag + tag_offset, {}, 0, + std::vector(), {}})) .first; } else if (it->second.size == 0) diff --git a/include/ghex/context.hpp b/include/ghex/context.hpp index d6994e20..edbf86bd 100644 --- a/include/ghex/context.hpp +++ b/include/ghex/context.hpp @@ -20,6 +20,7 @@ class barrier; class context { friend class barrier; + public: using rank_type = oomph::rank_type; using tag_type = oomph::tag_type; diff --git a/include/ghex/device/cuda/error.hpp b/include/ghex/device/cuda/error.hpp index 1725fec2..d4ab95c2 100644 --- a/include/ghex/device/cuda/error.hpp +++ b/include/ghex/device/cuda/error.hpp @@ -25,8 +25,12 @@ std::string(__FILE__) + ":" + std::to_string(__LINE__)); #define GHEX_CHECK_CUDA_RESULT_NO_THROW(x) \ - try { GHEX_CHECK_CUDA_RESULT(x) } \ - catch (const std::exception& e) { \ + try \ + { \ + GHEX_CHECK_CUDA_RESULT(x) \ + } \ + catch (const std::exception& e) \ + { \ std::cerr << e.what() << std::endl; \ std::terminate(); \ } diff --git a/include/ghex/device/cuda/future.hpp b/include/ghex/device/cuda/future.hpp index 26800dde..bdb0965f 100644 --- a/include/ghex/device/cuda/future.hpp +++ b/include/ghex/device/cuda/future.hpp @@ -28,9 +28,10 @@ namespace device template struct future { - GHEX_C_MANAGED_STRUCT(event_type, cudaEvent_t, - [](auto&&... args) { GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(std::forward(args)...)) }, - [](auto& e){ GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(e)) }) + GHEX_C_MANAGED_STRUCT( + event_type, cudaEvent_t, [](auto&&... 
args) + { GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(std::forward(args)...)) }, + [](auto& e) { GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(e)) }) event_type m_event; T m_data; @@ -64,9 +65,10 @@ struct future template<> struct future { - GHEX_C_MANAGED_STRUCT(event_type, cudaEvent_t, - [](auto&&... args) { GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(std::forward(args)...)) }, - [](auto& e){ GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(e)) }) + GHEX_C_MANAGED_STRUCT( + event_type, cudaEvent_t, [](auto&&... args) + { GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(std::forward(args)...)) }, + [](auto& e) { GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(e)) }) event_type m_event; diff --git a/include/ghex/device/cuda/runtime.hpp b/include/ghex/device/cuda/runtime.hpp index ff637b5a..ba6e8123 100644 --- a/include/ghex/device/cuda/runtime.hpp +++ b/include/ghex/device/cuda/runtime.hpp @@ -17,57 +17,57 @@ #include /* GridTools cuda -> hip translations */ -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaErrorInvalidValue hipErrorInvalidValue -#define cudaError_t hipError_t -#define cudaEventCreate hipEventCreate -#define cudaEventDestroy hipEventDestroy -#define cudaEventElapsedTime hipEventElapsedTime -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorName hipGetErrorName -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaMalloc hipMalloc -#define cudaMallocHost hipMallocHost -#define cudaMallocManaged hipMallocManaged -#define cudaMemAttachGlobal hipMemAttachGlobal -#define cudaMemcpy hipMemcpy -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define 
cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemoryTypeDevice hipMemoryTypeDevice -#define cudaPointerAttributes hipPointerAttribute_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventDestroy hipEventDestroy +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost hipMallocHost +#define cudaMallocManaged hipMallocManaged +#define cudaMemAttachGlobal hipMemAttachGlobal +#define cudaMemcpy hipMemcpy +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemoryTypeDevice hipMemoryTypeDevice +#define cudaPointerAttributes hipPointerAttribute_t #define cudaPointerGetAttributes hipPointerGetAttributes -#define cudaSetDevice hipSetDevice -#define cudaStreamCreate hipStreamCreate -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess /* additional cuda -> hip translations */ -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming 
-#define cudaEventInterprocess hipEventInterprocess -#define cudaEventQuery hipEventQuery -#define cudaIpcCloseMemHandle hipIpcCloseMemHandle -#define cudaIpcEventHandle_t hipIpcEventHandle_t -#define cudaIpcGetEventHandle hipIpcGetEventHandle -#define cudaIpcGetMemHandle hipIpcGetMemHandle -#define cudaIpcMemHandle_t hipIpcMemHandle_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventInterprocess hipEventInterprocess +#define cudaEventQuery hipEventQuery +#define cudaIpcCloseMemHandle hipIpcCloseMemHandle +#define cudaIpcEventHandle_t hipIpcEventHandle_t +#define cudaIpcGetEventHandle hipIpcGetEventHandle +#define cudaIpcGetMemHandle hipIpcGetMemHandle +#define cudaIpcMemHandle_t hipIpcMemHandle_t #define cudaIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess -#define cudaIpcOpenEventHandle hipIpcOpenEventHandle -#define cudaIpcOpenMemHandle hipIpcOpenMemHandle -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaIpcOpenEventHandle hipIpcOpenEventHandle +#define cudaIpcOpenMemHandle hipIpcOpenMemHandle +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking #else /* __HIP_PLATFORM_AMD__ */ diff --git a/include/ghex/device/cuda/stream.hpp b/include/ghex/device/cuda/stream.hpp index 5aa75ef0..bd47ea17 100644 --- a/include/ghex/device/cuda/stream.hpp +++ b/include/ghex/device/cuda/stream.hpp @@ -26,11 +26,8 @@ struct stream cudaEvent_t m_event; ghex::util::moved_bit m_moved; - stream() - { - GHEX_CHECK_CUDA_RESULT(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)) - GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)) - } + stream(){GHEX_CHECK_CUDA_RESULT(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)) + 
GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming))} stream(const stream&) = delete; stream& operator=(const stream&) = delete; diff --git a/include/ghex/glue/gridtools/field.hpp b/include/ghex/glue/gridtools/field.hpp index 4210e91c..c8e9160e 100644 --- a/include/ghex/glue/gridtools/field.hpp +++ b/include/ghex/glue/gridtools/field.hpp @@ -28,7 +28,7 @@ namespace _impl // return {Halo::template at()...}; //} -template +template using not_negative = std::integral_constant= 0)>; template @@ -47,8 +47,7 @@ template struct get_unmasked_layout_map> { using args = gridtools::meta::list...>; - using unmasked_args = - gridtools::meta::filter; + using unmasked_args = gridtools::meta::filter; using integer_seq = gridtools::meta::list_to_iseq; using type = typename get_layout_map::type; }; @@ -58,8 +57,8 @@ struct get_unmasked_layout_map> template auto wrap_gt_field(const DomainDescriptor& dom, const std::shared_ptr& ds, - const std::array& origin, int device_id = - arch_traits::current_id()) + const std::array& origin, + int device_id = arch_traits::current_id()) { using value_t = typename DataStore::data_t; using layout_t = typename DataStore::layout_t; @@ -72,16 +71,17 @@ wrap_gt_field(const DomainDescriptor& dom, const std::shared_ptr& ds, auto strides = ds->strides(); for (unsigned int i = 0u; i < dimension::value; ++i) strides[i] *= sizeof(value_t); - return field_desc_t( - dom, ds->get_target_ptr(), origin, ds->lengths(), strides, 1, false, device_id); + return field_desc_t(dom, ds->get_target_ptr(), origin, ds->lengths(), strides, 1, false, + device_id); } template auto -wrap_gt_field(const gt_grid& grid, DataStore&& ds, Origin&& origin, int device_id = arch_traits::current_id()) +wrap_gt_field(const gt_grid& grid, DataStore&& ds, Origin&& origin, + int device_id = arch_traits::current_id()) { - return wrap_gt_field( - grid.m_domains[0], std::forward(ds), std::forward(origin), device_id); + return wrap_gt_field(grid.m_domains[0], 
std::forward(ds), + std::forward(origin), device_id); } } // namespace ghex diff --git a/include/ghex/glue/gridtools/make_gt_pattern.hpp b/include/ghex/glue/gridtools/make_gt_pattern.hpp index 8d118be9..f30720ad 100644 --- a/include/ghex/glue/gridtools/make_gt_pattern.hpp +++ b/include/ghex/glue/gridtools/make_gt_pattern.hpp @@ -20,8 +20,8 @@ auto make_gt_pattern(Grid& grid, Halos&& halos) { const std::array first{0, 0, 0}; - const std::array last{ - grid.m_global_extents[0] - 1, grid.m_global_extents[1] - 1, grid.m_global_extents[2] - 1}; + const std::array last{grid.m_global_extents[0] - 1, grid.m_global_extents[1] - 1, + grid.m_global_extents[2] - 1}; using halo_gen_type = structured::regular::halo_generator>; auto halo_gen = halo_gen_type(first, last, std::forward(halos), grid.m_periodic); diff --git a/include/ghex/glue/gridtools/processor_grid.hpp b/include/ghex/glue/gridtools/processor_grid.hpp index 4d10168f..d801190d 100644 --- a/include/ghex/glue/gridtools/processor_grid.hpp +++ b/include/ghex/glue/gridtools/processor_grid.hpp @@ -107,16 +107,16 @@ make_gt_processor_grid(context& ctxt, const Array0& local_extents, const Array1& } std::partial_sum(extents_z.begin(), extents_z.end(), extents_z.begin()); - const std::array global_extents = { - extents_x.back(), extents_y.back(), extents_z.back()}; + const std::array global_extents = {extents_x.back(), extents_y.back(), + extents_z.back()}; const std::array global_first = {coords[0] == 0 ? 0 : extents_x[coords[0] - 1], coords[1] == 0 ? 0 : extents_y[coords[1] - 1], coords[2] == 0 ? 
0 : extents_z[coords[2] - 1]}; const std::array global_last = {global_first[0] + local_extents[0] - 1, global_first[1] + local_extents[1] - 1, global_first[2] + local_extents[2] - 1}; - structured::regular::domain_descriptor> local_domain{ - rank, global_first, global_last}; + structured::regular::domain_descriptor> local_domain{rank, + global_first, global_last}; return {ctxt, {local_domain}, global_extents, periodic}; } diff --git a/include/ghex/packer.hpp b/include/ghex/packer.hpp index a1475ad7..3c807a66 100644 --- a/include/ghex/packer.hpp +++ b/include/ghex/packer.hpp @@ -74,8 +74,9 @@ await_futures(std::vector& range, Continuation&& cont) auto end = index_list.end(); while (begin != end) { - end = - std::remove_if(begin, end, [&range, cont = std::forward(cont)](int idx) { + end = std::remove_if(begin, end, + [&range, cont = std::forward(cont)](int idx) + { if (range[idx].test()) { cont(range[idx].get()); @@ -158,9 +159,8 @@ struct packer } } } - await_futures(stream_futures, [&comm, &send_reqs](send_buffer_type* b) { - send_reqs.push_back(comm.send(b->buffer, b->rank, b->tag)); - }); + await_futures(stream_futures, [&comm, &send_reqs](send_buffer_type* b) + { send_reqs.push_back(comm.send(b->buffer, b->rank, b->tag)); }); } template @@ -274,9 +274,8 @@ struct packer } } } - await_futures(stream_futures, [&comm, &send_reqs](send_buffer_type* b) { - send_reqs.push_back(comm.send(b->buffer, b->rank, b->tag)); - }); + await_futures(stream_futures, [&comm, &send_reqs](send_buffer_type* b) + { send_reqs.push_back(comm.send(b->buffer, b->rank, b->tag)); }); } }; #endif diff --git a/include/ghex/pattern_container.hpp b/include/ghex/pattern_container.hpp index da46bf70..b725fccd 100644 --- a/include/ghex/pattern_container.hpp +++ b/include/ghex/pattern_container.hpp @@ -96,8 +96,8 @@ class pattern_container private: // members oomph::context* m_ctxt; - data_type m_patterns; - int m_max_tag; + data_type m_patterns; + int m_max_tag; }; /** @brief construct a pattern 
for each domain and establish neighbor relationships @@ -115,8 +115,8 @@ make_pattern(context& c, HaloGenerator&& hgen, DomainRange&& d_range) { using grid_type = typename GridType::template type::value_type>; - return detail::make_pattern_impl::apply( - c, std::forward(hgen), std::forward(d_range)); + return detail::make_pattern_impl::apply(c, std::forward(hgen), + std::forward(d_range)); } /** @brief construct a pattern for each domain and establish neighbor relationships, with @@ -136,8 +136,8 @@ make_pattern(context& c, HaloGenerator&& hgen, DomainRange&& d_range) * @return iterable of patterns (one per domain) */ template auto -make_pattern( - context& c, HaloGenerator&& hgen, RecvDomainIdsGen&& recv_domain_ids_gen, DomainRange&& d_range) +make_pattern(context& c, HaloGenerator&& hgen, RecvDomainIdsGen&& recv_domain_ids_gen, + DomainRange&& d_range) { using grid_type = typename GridType::template type::value_type>; diff --git a/include/ghex/rma/cuda/handle.hpp b/include/ghex/rma/cuda/handle.hpp index 4b368a88..0ec2509c 100644 --- a/include/ghex/rma/cuda/handle.hpp +++ b/include/ghex/rma/cuda/handle.hpp @@ -74,7 +74,9 @@ struct remote_data_holder { // detach rma resource if (m_on_gpu && m_loc == locality::process && m_attached) - { GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaIpcCloseMemHandle(m_cuda_ptr)); } + { + GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaIpcCloseMemHandle(m_cuda_ptr)); + } } void attach(resource_cache& cache, void* ptr) diff --git a/include/ghex/rma/event.hpp b/include/ghex/rma/event.hpp index 3ff8c75a..299f993c 100644 --- a/include/ghex/rma/event.hpp +++ b/include/ghex/rma/event.hpp @@ -54,11 +54,13 @@ struct local_event { #ifdef GHEX_CUDACC if (m_loc == locality::thread) - { GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); } + { + GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + } if (m_loc == locality::process) { - GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags( - &m_event, 
cudaEventDisableTiming | cudaEventInterprocess)); + GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, + cudaEventDisableTiming | cudaEventInterprocess)); GHEX_CHECK_CUDA_RESULT(cudaIpcGetEventHandle(&m_event_handle, m_event)); } #endif @@ -67,7 +69,10 @@ struct local_event ~data_holder() { #ifdef GHEX_CUDACC - if (m_loc != locality::remote) { GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(m_event)); } + if (m_loc != locality::remote) + { + GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(m_event)); + } #endif } @@ -147,7 +152,8 @@ struct remote_event ~data_holder() { #ifdef GHEX_CUDACC - if (m_source_on_gpu || m_target_on_gpu) GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaStreamDestroy(m_stream)); + if (m_source_on_gpu || m_target_on_gpu) + GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaStreamDestroy(m_stream)); #endif } diff --git a/include/ghex/rma/handle.hpp b/include/ghex/rma/handle.hpp index 3e9fc302..63161859 100644 --- a/include/ghex/rma/handle.hpp +++ b/include/ghex/rma/handle.hpp @@ -129,8 +129,8 @@ struct remote_handle void* get_ptr(locality loc) const { - static_assert( - std::is_same::value, ""); // prevent compiler warning + static_assert(std::is_same::value, + ""); // prevent compiler warning #if defined(GHEX_GPU_MODE_EMULATE) && defined(GHEX_USE_XPMEM) if (loc == locality::process) return m_xpmem_data_holder.get_ptr(); #elif defined(GHEX_USE_XPMEM) diff --git a/include/ghex/rma/range_factory.hpp b/include/ghex/rma/range_factory.hpp index 3c584fcc..8cdf56a9 100644 --- a/include/ghex/rma/range_factory.hpp +++ b/include/ghex/rma/range_factory.hpp @@ -70,8 +70,9 @@ struct range_factory event_info e_info_; std::memcpy(&e_info_, buffer, sizeof(event_info)); buffer += a16(sizeof(event_info)); - return boost::mp11::mp_with_index::value>( - id, [buffer, field_info, info_, e_info_, rank, on_gpu](auto Id) { + return boost::mp11::mp_with_index::value>(id, + [buffer, field_info, info_, e_info_, rank, on_gpu](auto Id) + { using range_t = boost::mp11::mp_at; return 
range(std::move(*reinterpret_cast(buffer)), decltype(Id)::value, field_info, info_, e_info_, rank, on_gpu); @@ -82,20 +83,21 @@ struct range_factory template static void call_back_with_type(range& r, Func&& f) { - boost::mp11::mp_with_index::value>( - r.m_id, [&r, f = std::forward(f)](auto Id) { + boost::mp11::mp_with_index::value>(r.m_id, + [&r, f = std::forward(f)](auto Id) + { using range_t = boost::mp11::mp_at; f(reinterpret_cast*>(r.m_impl.get())->m); }); } - //private: + //private: template static void serialize(info field_info, local_access_guard& g, local_event& e, const Range& r, unsigned char* buffer) { - static_assert( - boost::mp11::mp_set_contains::value, "range type not registered"); + static_assert(boost::mp11::mp_set_contains::value, + "range type not registered"); using id = boost::mp11::mp_find; const int m_id = id::value; std::memcpy(buffer, &m_id, sizeof(int)); diff --git a/include/ghex/rma/shmem/access_guard.hpp b/include/ghex/rma/shmem/access_guard.hpp index c982019a..caa002c9 100644 --- a/include/ghex/rma/shmem/access_guard.hpp +++ b/include/ghex/rma/shmem/access_guard.hpp @@ -43,7 +43,7 @@ struct local_access_guard impl(access_mode m) : m_handle(&m_ptr, sizeof(access_state), false) - , m_state{*(new (m_ptr) access_state{m, {}, {}})} + , m_state{*(new(m_ptr) access_state{m, {}, {}})} { } }; @@ -69,8 +69,8 @@ struct local_access_guard void start_target_epoch() { lock_type lk{m_impl->m_state.m_mtx}; - m_impl->m_state.m_cv.wait( - lk, [this] { return m_impl->m_state.m_mode == access_mode::local; }); + m_impl->m_state.m_cv.wait(lk, + [this] { return m_impl->m_state.m_mode == access_mode::local; }); } bool try_start_target_epoch() diff --git a/include/ghex/rma/thread/access_guard.hpp b/include/ghex/rma/thread/access_guard.hpp index b428c357..d7756ed7 100644 --- a/include/ghex/rma/thread/access_guard.hpp +++ b/include/ghex/rma/thread/access_guard.hpp @@ -65,8 +65,8 @@ struct local_access_guard void start_target_epoch() { std::unique_lock 
lk{m_impl->m_state.m_mtx}; - m_impl->m_state.m_cv.wait( - lk, [this] { return m_impl->m_state.m_mode == access_mode::local; }); + m_impl->m_state.m_cv.wait(lk, + [this] { return m_impl->m_state.m_mode == access_mode::local; }); } bool try_start_target_epoch() diff --git a/include/ghex/rma/xpmem/handle.hpp b/include/ghex/rma/xpmem/handle.hpp index f2e973fb..5b6b66f5 100644 --- a/include/ghex/rma/xpmem/handle.hpp +++ b/include/ghex/rma/xpmem/handle.hpp @@ -28,9 +28,9 @@ namespace xpmem // Below are implementations of a handle in a multi-process setting using xpmem. // Please refer to the documentation in rma/handle.hpp for further explanations. -#define align_down_pow2(_n, _alignment) ((_n) & ~((_alignment)-1)) +#define align_down_pow2(_n, _alignment) ((_n) & ~((_alignment) - 1)) -#define align_up_pow2(_n, _alignment) align_down_pow2((_n) + (_alignment)-1, _alignment) +#define align_up_pow2(_n, _alignment) align_down_pow2((_n) + (_alignment) - 1, _alignment) struct info { diff --git a/include/ghex/structured/cubed_sphere/field_descriptor.hpp b/include/ghex/structured/cubed_sphere/field_descriptor.hpp index f84ff60c..f2f22c44 100644 --- a/include/ghex/structured/cubed_sphere/field_descriptor.hpp +++ b/include/ghex/structured/cubed_sphere/field_descriptor.hpp @@ -175,8 +175,8 @@ class field_descriptor> } template - unpack_iteration_space make_unpack_is( - const IterationSpace& is, const T* buffer, size_type size, const transform& t) + unpack_iteration_space make_unpack_is(const IterationSpace& is, const T* buffer, size_type size, + const transform& t) { return { make_buffer_desc>(is, buffer, size), @@ -189,8 +189,8 @@ class field_descriptor> { // description of the halo in the buffer coordinate_type buffer_offset; - std::copy( - is.global().first().begin() + 1, is.global().first().end(), buffer_offset.begin()); + std::copy(is.global().first().begin() + 1, is.global().first().end(), + buffer_offset.begin()); if (has_components::value) buffer_offset[dimension::value - 1] 
= 0; coordinate_type buffer_extents; std::copy(is.global().last().begin() + 1, is.global().last().end(), buffer_extents.begin()); diff --git a/include/ghex/structured/cubed_sphere/halo_generator.hpp b/include/ghex/structured/cubed_sphere/halo_generator.hpp index ce3a7453..9be7ceca 100644 --- a/include/ghex/structured/cubed_sphere/halo_generator.hpp +++ b/include/ghex/structured/cubed_sphere/halo_generator.hpp @@ -154,7 +154,9 @@ class halo_generator const auto h_i = intersect(h_box, tile_box); if ((h_i.global().first()[1] <= h_i.global().last()[1]) && (h_i.global().first()[2] <= h_i.global().last()[2])) - { result.push_back(h_i); } + { + result.push_back(h_i); + } // intersect with the 4 neighbor tiles for (int n = 0; n < 4; ++n) { @@ -277,10 +279,7 @@ class halo_generator } return {box{first_a_local_new, last_a_local_new}, x}; } - else - { - return intersect(box2{b_a_local, b_a_global}, b_b_global); - } + else { return intersect(box2{b_a_local, b_a_global}, b_b_global); } } }; diff --git a/include/ghex/structured/cubed_sphere/transform.hpp b/include/ghex/structured/cubed_sphere/transform.hpp index f6fee9ad..bd1b861b 100644 --- a/include/ghex/structured/cubed_sphere/transform.hpp +++ b/include/ghex/structured/cubed_sphere/transform.hpp @@ -114,18 +114,18 @@ static constexpr std::array, 6> transform_lu = { // inverse transform: neigbhor tile coordinates to this tile coordinates static constexpr std::array, 6> inverse_transform_lu = { - std::array{ - transform_lu[4][3], transform_lu[1][0], transform_lu[5][3], transform_lu[2][0]}, - std::array{ - transform_lu[0][1], transform_lu[3][2], transform_lu[5][1], transform_lu[2][2]}, - std::array{ - transform_lu[0][3], transform_lu[3][0], transform_lu[1][3], transform_lu[4][0]}, - std::array{ - transform_lu[2][1], transform_lu[5][2], transform_lu[1][1], transform_lu[4][2]}, - std::array{ - transform_lu[2][3], transform_lu[5][0], transform_lu[3][3], transform_lu[0][0]}, - std::array{ - transform_lu[4][1], transform_lu[1][2], 
transform_lu[3][1], transform_lu[0][2]}}; + std::array{transform_lu[4][3], transform_lu[1][0], transform_lu[5][3], + transform_lu[2][0]}, + std::array{transform_lu[0][1], transform_lu[3][2], transform_lu[5][1], + transform_lu[2][2]}, + std::array{transform_lu[0][3], transform_lu[3][0], transform_lu[1][3], + transform_lu[4][0]}, + std::array{transform_lu[2][1], transform_lu[5][2], transform_lu[1][1], + transform_lu[4][2]}, + std::array{transform_lu[2][3], transform_lu[5][0], transform_lu[3][3], + transform_lu[0][0]}, + std::array{transform_lu[4][1], transform_lu[1][2], transform_lu[3][1], + transform_lu[0][2]}}; } // namespace cubed_sphere } // namespace structured diff --git a/include/ghex/structured/field_descriptor.hpp b/include/ghex/structured/field_descriptor.hpp index e1e9090f..66b6ad51 100644 --- a/include/ghex/structured/field_descriptor.hpp +++ b/include/ghex/structured/field_descriptor.hpp @@ -152,8 +152,8 @@ class field_descriptor field_descriptor(const domain_descriptor_type& dom_, const DomainArray& dom_first_, value_type* data_, const OffsetArray& offsets_, const ExtentArray& extents_, - unsigned int num_components_ = 1u, bool is_vector_field_ = false, device_id_type d_id_ = - arch_traits::current_id()) + unsigned int num_components_ = 1u, bool is_vector_field_ = false, + device_id_type d_id_ = arch_traits::current_id()) : m_dom{dom_} , m_data{data_} , m_num_components{num_components_} @@ -179,8 +179,8 @@ class field_descriptor::template apply( - m_extents, m_byte_strides, 0u); + detail::compute_strides::template apply(m_extents, + m_byte_strides, 0u); m_bytes = m_byte_strides[layout_map::find(0)] * m_extents[layout_map::find(0)]; } @@ -189,8 +189,8 @@ class field_descriptor::current_id()) - : field_descriptor( - dom_, dom_first_, data_, offsets_, extents_, num_components_, is_vector_field_, d_id_) + : field_descriptor(dom_, dom_first_, data_, offsets_, extents_, num_components_, + is_vector_field_, d_id_) { for (unsigned int i = 0u; i < 
dimension::value; ++i) m_byte_strides[i] = strides_[i]; m_bytes = m_byte_strides[layout_map::find(0)] * m_extents[layout_map::find(0)]; diff --git a/include/ghex/structured/field_utils.hpp b/include/ghex/structured/field_utils.hpp index 1e5b246d..1dc83f90 100644 --- a/include/ghex/structured/field_utils.hpp +++ b/include/ghex/structured/field_utils.hpp @@ -17,28 +17,28 @@ namespace gridtools { template GHEX_FUNCTION array - operator+(array a, const array& b) + operator+(array a, const array& b) { for (std::size_t i = 0u; i < D; ++i) a[i] += b[i]; return a; } template GHEX_FUNCTION array - operator+(array a, const U& scalar) + operator+(array a, const U& scalar) { for (std::size_t i = 0u; i < D; ++i) a[i] += scalar; return a; } template GHEX_FUNCTION array - operator+(const U& scalar, array a) + operator+(const U& scalar, array a) { for (std::size_t i = 0u; i < D; ++i) a[i] += scalar; return a; } template GHEX_FUNCTION array - operator-(array a, const array& b) + operator-(array a, const array& b) { for (std::size_t i = 0u; i < D; ++i) a[i] -= b[i]; return a; @@ -100,7 +100,8 @@ struct compute_strides compute_strides_impl::template apply(extents, strides); } template - GHEX_FUNCTION static void apply(const Coordinate& extents, Strides& strides, std::size_t padding) + GHEX_FUNCTION static void apply(const Coordinate& extents, Strides& strides, + std::size_t padding) { const auto idx = Layout::find(D - 1); strides[idx] = sizeof(T); @@ -132,8 +133,8 @@ struct compute_coordinate_impl { const auto idx = Layout::find(D - (K)); coord[idx] = i / strides[idx]; - compute_coordinate_impl::template apply( - strides, coord, i - coord[idx] * strides[idx]); + compute_coordinate_impl::template apply(strides, coord, + i - coord[idx] * strides[idx]); } }; template @@ -152,8 +153,8 @@ struct compute_coordinate { const auto idx = Layout::find(0); coord[idx] = i / strides[idx]; - compute_coordinate_impl::template apply( - strides, coord, i - coord[idx] * strides[idx]); + 
compute_coordinate_impl::template apply(strides, coord, + i - coord[idx] * strides[idx]); } }; diff --git a/include/ghex/structured/pack_kernels.hpp b/include/ghex/structured/pack_kernels.hpp index c2c6fa7d..4f7ceb29 100644 --- a/include/ghex/structured/pack_kernels.hpp +++ b/include/ghex/structured/pack_kernels.hpp @@ -44,10 +44,8 @@ struct serialization { using coordinate_type = typename PackIterationSpace::coordinate_t; static constexpr auto D = coordinate_type::size(); - ::ghex::for_loop::template apply( - [&pack_is](auto... xs) { - pack_is.buffer(coordinate_type{xs...}) = pack_is.data(coordinate_type{xs...}); - }, + ::ghex::for_loop::template apply([&pack_is](auto... xs) + { pack_is.buffer(coordinate_type{xs...}) = pack_is.data(coordinate_type{xs...}); }, pack_is.m_data_is.m_first, pack_is.m_data_is.m_last); } @@ -56,10 +54,8 @@ struct serialization { using coordinate_type = typename UnPackIterationSpace::coordinate_t; static constexpr auto D = coordinate_type::size(); - ::ghex::for_loop::template apply( - [&unpack_is](auto... xs) { - unpack_is.data(coordinate_type{xs...}) = unpack_is.buffer(coordinate_type{xs...}); - }, + ::ghex::for_loop::template apply([&unpack_is](auto... xs) + { unpack_is.data(coordinate_type{xs...}) = unpack_is.buffer(coordinate_type{xs...}); }, unpack_is.m_data_is.m_first, unpack_is.m_data_is.m_last); } @@ -84,7 +80,8 @@ struct serialization last[j++] = pack_is.m_data_is.m_last[i]; } ::ghex::for_loop::template apply( - [&pack_is, &x_first, &x_last](auto... xs) { + [&pack_is, &x_first, &x_last](auto... xs) + { const cont_coord_type x0{xs...}; coordinate_type x1; x1[cont_idx] = x_first; @@ -121,7 +118,8 @@ struct serialization last[j++] = unpack_is.m_data_is.m_last[i]; } ::ghex::for_loop::template apply( - [&unpack_is, &x_first, &x_last](auto... xs) { + [&unpack_is, &x_first, &x_last](auto... 
xs) + { const cont_coord_type x0{xs...}; coordinate_type x1; x1[cont_idx] = x_first; diff --git a/include/ghex/structured/pattern.hpp b/include/ghex/structured/pattern.hpp index 0ccbff39..2a3b3b46 100644 --- a/include/ghex/structured/pattern.hpp +++ b/include/ghex/structured/pattern.hpp @@ -82,8 +82,8 @@ class pattern, DomainIdType> public: // print template - friend std::basic_ostream& operator<<( - std::basic_ostream& os, const iteration_space& is) + friend std::basic_ostream& operator<<(std::basic_ostream& os, + const iteration_space& is) { os << "[" << is._min << ", " << is._max << "]"; return os; @@ -111,8 +111,8 @@ class pattern, DomainIdType> public: // print template - friend std::basic_ostream& operator<<( - std::basic_ostream& os, const iteration_space_pair& is) + friend std::basic_ostream& operator<<(std::basic_ostream& os, + const iteration_space_pair& is) { os << is.m_global << " (local: " << is.m_local << ")"; return os; @@ -126,7 +126,7 @@ class pattern, DomainIdType> public: // members domain_id_type id; int mpi_rank; - int tag; + int tag; public: // member functions // unique ordering given by id and tag @@ -137,8 +137,8 @@ class pattern, DomainIdType> public: // print template - friend std::basic_ostream& operator<<( - std::basic_ostream& os, const extended_domain_id_type& dom_id) + friend std::basic_ostream& operator<<(std::basic_ostream& os, + const extended_domain_id_type& dom_id) { os << "{id=" << dom_id.id << ", tag=" << dom_id.tag << ", rank=" << dom_id.mpi_rank << "}"; @@ -255,11 +255,13 @@ struct make_pattern_impl> { iteration_space_pair is{iteration_space{coordinate_type{h.local().first()}, coordinate_type{h.local().last()}}, - iteration_space{ - coordinate_type{h.global().first()}, coordinate_type{h.global().last()}}}; + iteration_space{coordinate_type{h.global().first()}, + coordinate_type{h.global().last()}}}; // check that invariant is fullfilled (halos are not empty) if (is.local().first() <= is.local().last()) - { 
my_generated_recv_halos.back().push_back(is); } + { + my_generated_recv_halos.back().push_back(is); + } } } @@ -309,8 +311,8 @@ struct make_pattern_impl> const auto& extent = extents_vec[k]; const auto& domain_id = domain_id_vec[k]; const auto x = hgen.intersect(*d_it, halo.local().first(), - halo.local().last(), halo.global().first(), halo.global().last(), - extent.global().first(), extent.global().last()); + halo.local().last(), halo.global().first(), halo.global().last(), + extent.global().first(), extent.global().last()); const coordinate_type x_global_first{x.global().first()}; const coordinate_type x_global_last{x.global().last()}; if (x_global_first <= x_global_last) @@ -564,12 +566,13 @@ struct make_pattern_impl> } } - return pattern_container(ctxt, std::move(my_patterns), m_max_tag); + return pattern_container(ctxt, std::move(my_patterns), + m_max_tag); } template - static auto apply( - context& ctxt, HaloGenerator&& hgen, RecvDomainIdsGen&&, DomainRange&& d_range) + static auto apply(context& ctxt, HaloGenerator&& hgen, RecvDomainIdsGen&&, + DomainRange&& d_range) { return apply(ctxt, hgen, d_range); } diff --git a/include/ghex/structured/regular/field_descriptor.hpp b/include/ghex/structured/regular/field_descriptor.hpp index 8d3d2cf0..b052e9ea 100644 --- a/include/ghex/structured/regular/field_descriptor.hpp +++ b/include/ghex/structured/regular/field_descriptor.hpp @@ -184,10 +184,11 @@ wrap_field(const DomainDescriptor& dom, T* data, const Array& offsets, const Arr * @param extents extent of the wrapped N-dimensional array (including buffer regions) * @param strides array strides * @return wrapped field*/ -template +template structured::regular::field_descriptor wrap_field(const DomainDescriptor& dom, T* data, const Array& offsets, const Array& extents, - const Strides& strides, + const Strides& strides, typename arch_traits::device_id_type device_id = arch_traits::current_id()) { return {dom, data, offsets, extents, strides, 1, false, device_id}; 
diff --git a/include/ghex/structured/regular/halo_generator.hpp b/include/ghex/structured/regular/halo_generator.hpp index 0a9b0f08..c8f0fc5f 100644 --- a/include/ghex/structured/regular/halo_generator.hpp +++ b/include/ghex/structured/regular/halo_generator.hpp @@ -33,8 +33,8 @@ class halo_generator> using dimension = typename domain_type::dimension; using coordinate_type = typename grid::template type::coordinate_type; - //private: // member types - // todo (tehrengruber): check with ghex team + //private: // member types + // todo (tehrengruber): check with ghex team public: // member types struct box { @@ -66,8 +66,8 @@ class halo_generator> * @param halos list of halo sizes (dim0_dir-, dim0_dir+, dim1_dir-, dim1_dir+, ...) * @param periodic list of bools indicating periodicity per dimension (true,true,false,...) */ template - halo_generator( - const Array& g_first, const Array& g_last, RangeHalos&& halos, RangePeriodic&& periodic) + halo_generator(const Array& g_first, const Array& g_last, RangeHalos&& halos, + RangePeriodic&& periodic) { std::copy(std::begin(g_first), std::end(g_first), m_first.begin()); std::copy(std::begin(g_last), std::end(g_last), m_last.begin()); @@ -150,8 +150,8 @@ class halo_generator> const coordinate_type& last_a_global, const coordinate_type& first_b_global, const coordinate_type& last_b_global) const noexcept { - const box global_box{ - max(first_a_global, first_b_global), min(last_a_global, last_b_global)}; + const box global_box{max(first_a_global, first_b_global), + min(last_a_global, last_b_global)}; const box local_box{first_a_local + (global_box.first() - first_a_global), first_a_local + (global_box.last() - first_a_global)}; return {local_box, global_box}; diff --git a/include/ghex/structured/regular/make_pattern.hpp b/include/ghex/structured/regular/make_pattern.hpp index 7fc7ab82..8981795c 100644 --- a/include/ghex/structured/regular/make_pattern.hpp +++ b/include/ghex/structured/regular/make_pattern.hpp @@ -242,8 +242,8 
@@ make_staged_pattern(ghex::context& ctxt, DomainRange&& d_range, DomainLookUp&& d auto& ti_vec = send_tag_map[id_is_pair.first.mpi_rank]; domain_id_type source_id = p.domain_id(); domain_id_type dest_id = id_is_pair.first.id; - auto tag = - std::find_if(ti_vec.begin(), ti_vec.end(), [source_id, dest_id](const auto& x) { + auto tag = std::find_if(ti_vec.begin(), ti_vec.end(), + [source_id, dest_id](const auto& x) { return x.source_id == source_id && x.dest_id == dest_id; })->tag; const_cast(id_is_pair.first).tag = tag; diff --git a/include/ghex/structured/rma_put.hpp b/include/ghex/structured/rma_put.hpp index da106ebb..996c4b26 100644 --- a/include/ghex/structured/rma_put.hpp +++ b/include/ghex/structured/rma_put.hpp @@ -72,7 +72,8 @@ put(rma_range& s, rma_range& t, using sv_t = rma_range; using coordinate = typename sv_t::coordinate; for_loop::apply( - [&s, &t](auto... c) { + [&s, &t](auto... c) + { auto dst = t.ptr(coordinate{c...}); auto src = s.ptr(coordinate{c...}); for (unsigned int i = 0; i < s.m_chunk_size_; ++i) { dst[i] = src[i]; } @@ -96,7 +97,8 @@ put(rma_range& s, rma_range& t, using coordinate = typename sv_t::coordinate; const auto nc = s.m_field.num_components(); for_loop::apply( - [&s, &t, nc](auto... c) { + [&s, &t, nc](auto... c) + { std::memcpy(t.ptr(coordinate{c...}), s.ptr(coordinate{c...}), s.m_chunk_size * nc); // auto dst = t.ptr(coordinate{c...}); // auto src = s.ptr(coordinate{c...}); @@ -124,7 +126,8 @@ put([[maybe_unused]] rma_range& s, [[maybe_unused]] rma_range; using coordinate = typename sv_t::coordinate; for_loop::apply( - [&s, &t, &st](auto... c) { + [&s, &t, &st](auto... c) + { GHEX_CHECK_CUDA_RESULT(cudaMemcpyAsync(t.ptr(coordinate{c...}), s.ptr(coordinate{c...}), s.m_chunk_size, cudaMemcpyHostToDevice, st)); }, @@ -152,7 +155,8 @@ put([[maybe_unused]] rma_range& s, [[maybe_unused]] rma_range::apply( - [&s, &t, &st](auto... c) { + [&s, &t, &st](auto... 
c) + { GHEX_CHECK_CUDA_RESULT(cudaMemcpyAsync(t.ptr(coordinate{c...}), s.ptr(coordinate{c...}), s.m_chunk_size, cudaMemcpyDeviceToHost, st)); }, @@ -161,7 +165,8 @@ put([[maybe_unused]] rma_range& s, [[maybe_unused]] rma_range::apply( - [&s, &t, &st](auto... c) { + [&s, &t, &st](auto... c) + { GHEX_CHECK_CUDA_RESULT(cudaMemcpyAsync(t.ptr(coordinate{c...}), s.ptr(coordinate{c...}), s.m_chunk_size, cudaMemcpyDeviceToHost, st)); }, @@ -175,7 +180,8 @@ put([[maybe_unused]] rma_range& s, [[maybe_unused]] rma_range::apply( - [&s, &t, &st, &i, &st2](auto... c) { + [&s, &t, &st, &i, &st2](auto... c) + { if (data.size() < i + 1) data.push_back(std::vector(s.m_chunk_size)); else data[i].resize(s.m_chunk_size); @@ -186,10 +192,9 @@ put([[maybe_unused]] rma_range& s, [[maybe_unused]] rma_range::apply( - [&s, &t, &i](auto... c) { - std::memcpy(t.ptr(coordinate{c...}), data[i++].data(), s.m_chunk_size); - }, - s.m_begin, s.m_end); + [&s, &t, &i](auto... c) + { std::memcpy(t.ptr(coordinate{c...}), data[i++].data(), s.m_chunk_size); }, s.m_begin, + s.m_end); } #endif #endif diff --git a/include/ghex/structured/rma_range_generator.hpp b/include/ghex/structured/rma_range_generator.hpp index 6b7bd314..56e74147 100644 --- a/include/ghex/structured/rma_range_generator.hpp +++ b/include/ghex/structured/rma_range_generator.hpp @@ -68,8 +68,8 @@ struct rma_range_generator , m_event{m_on_gpu, loc} , m_comm{&comm} { - RangeFactory::serialize( - field_info, m_local_guard, m_event, m_local_range, m_archive.data()); + RangeFactory::serialize(field_info, m_local_guard, m_event, m_local_range, + m_archive.data()); m_request = comm.send(m_archive, m_dst, m_tag); } @@ -148,8 +148,8 @@ struct rma_range_generator m_request.wait(); // creates a traget range m_remote_range = RangeFactory::deserialize(m_archive.data(), m_src, m_on_gpu); - RangeFactory::call_back_with_type( - m_remote_range, [this](auto& r) { init(r, m_remote_range); }); + RangeFactory::call_back_with_type(m_remote_range, + [this](auto& 
r) { init(r, m_remote_range); }); m_remote_range.end_source_epoch(); } diff --git a/include/ghex/unstructured/user_concepts.hpp b/include/ghex/unstructured/user_concepts.hpp index 66becac5..a8f5250c 100644 --- a/include/ghex/unstructured/user_concepts.hpp +++ b/include/ghex/unstructured/user_concepts.hpp @@ -307,7 +307,8 @@ class data_descriptor , m_index_stride{levels_first ? (outer_stride ? outer_stride : m_levels) : 1u} , m_level_stride{levels_first ? 1u : (outer_stride ? outer_stride : m_domain_size)} { - assert(field.size() == (levels_first ? domain.size() * m_index_stride : m_level_stride * m_levels)); + assert(field.size() == + (levels_first ? domain.size() * m_index_stride : m_level_stride * m_levels)); assert(!(outer_stride) || (outer_stride >= (levels_first ? m_levels : m_domain_size))); } @@ -318,8 +319,8 @@ class data_descriptor * @param levels_first stride of levels * @param levels_first indicates whether levels have stide 1 * @param outer_stride outer dimension's stride measured in number of elements of type T (special value 0: no padding)*/ - data_descriptor(const domain_descriptor_type& domain, value_type* field_ptr, std::size_t levels = 1u, - bool levels_first = true, std::size_t outer_stride = 0u) + data_descriptor(const domain_descriptor_type& domain, value_type* field_ptr, + std::size_t levels = 1u, bool levels_first = true, std::size_t outer_stride = 0u) : m_domain_id{domain.domain_id()} , m_domain_size{domain.size()} , m_levels{levels} @@ -338,8 +339,8 @@ class data_descriptor * @param levels number of levels * @param levels_first indicates whether levels have stide 1 * @param outer_stride outer dimension's stride measured in number of elements of type T (special value 0: no padding)*/ - data_descriptor(domain_id_type domain_id, std::size_t domain_size, value_type* field_ptr, std::size_t levels = 1u, - bool levels_first = true, std::size_t outer_stride = 0u) + data_descriptor(domain_id_type domain_id, std::size_t domain_size, value_type* 
field_ptr, + std::size_t levels = 1u, bool levels_first = true, std::size_t outer_stride = 0u) : m_domain_id{domain_id} , m_domain_size{domain_size} , m_levels{levels} @@ -404,7 +405,6 @@ class data_descriptor buffer += sizeof(value_type); } } - } } diff --git a/include/ghex/util/coordinate.hpp b/include/ghex/util/coordinate.hpp index 8706f8a0..25e94f1e 100644 --- a/include/ghex/util/coordinate.hpp +++ b/include/ghex/util/coordinate.hpp @@ -34,8 +34,8 @@ struct coordinate public: // print template - friend std::basic_ostream& operator<<( - std::basic_ostream& os, const coordinate& c) + friend std::basic_ostream& operator<<(std::basic_ostream& os, + const coordinate& c) { os << "{"; for (int i = 0; i < size() - 1; ++i) os << c.m_coord[i] << ", "; diff --git a/include/ghex/util/decomposition.hpp b/include/ghex/util/decomposition.hpp index b2b91ecc..8b60416d 100644 --- a/include/ghex/util/decomposition.hpp +++ b/include/ghex/util/decomposition.hpp @@ -144,7 +144,7 @@ class hierarchical_decomposition /** returns domain coordinate given rank and thread index */ array_type operator()(size_type rank, size_type thread_idx) const noexcept { - return this->operator()(rank* threads_per_rank() + thread_idx); + return this->operator()(rank * threads_per_rank() + thread_idx); } }; diff --git a/include/ghex/util/resource_layout.hpp b/include/ghex/util/resource_layout.hpp index bfb1aa59..20788235 100644 --- a/include/ghex/util/resource_layout.hpp +++ b/include/ghex/util/resource_layout.hpp @@ -25,8 +25,8 @@ struct dist_1D_generator> using dist_1D_tuple_type = std::tuple...>; template - static hierarchical_distribution generate_1( - const Dims& dims, std::index_sequence) noexcept + static hierarchical_distribution generate_1(const Dims& dims, + std::index_sequence) noexcept { return {{dims[sizeof...(Is) - sizeof...(Js) + 1 + Js].size()...}, false}; } @@ -68,8 +68,8 @@ class hierarchical_resource_layout private: template - static distribution_type make_dist( - const 
dims_map_array_type& d, std::index_sequence) noexcept + static distribution_type make_dist(const dims_map_array_type& d, + std::index_sequence) noexcept { return {{d[Is].size()...}, true}; } @@ -131,15 +131,15 @@ class hierarchical_resource_layout } private: - size_type relative_resource( - size_type idx, std::integral_constant) const noexcept + size_type relative_resource(size_type idx, + std::integral_constant) const noexcept { return idx - index<0>(idx) * std::get<0>(m_1D_dist).size(); } template - size_type relative_resource( - size_type idx, std::integral_constant) const noexcept + size_type relative_resource(size_type idx, + std::integral_constant) const noexcept { return relative_resource(idx, std::integral_constant()) - index(idx) * std::get(m_1D_dist).size(); diff --git a/scripts/container_clang_format.sh b/scripts/container_clang_format.sh index ccc1e872..05e5f6ca 100644 --- a/scripts/container_clang_format.sh +++ b/scripts/container_clang_format.sh @@ -7,10 +7,11 @@ CONTAINERFILE="$REPO_ROOT/$CONTAINERFILE_REL" IMAGE_NAME="${CLANG_FORMAT_IMAGE_NAME:-clang-format}" # Detect container runtime -if command -v docker &>/dev/null; then - OCIRUN="docker" - USER_FLAG="--user $(id -u):$(id -g)" -elif command -v podman &>/dev/null; then +# if command -v docker &>/dev/null; then +# OCIRUN="docker" +# USER_FLAG="--user $(id -u):$(id -g)" +# el +if command -v podman &>/dev/null; then OCIRUN="podman" USER_FLAG="--userns=keep-id" else diff --git a/test/mpi_runner/gtest_main_mpi.cpp b/test/mpi_runner/gtest_main_mpi.cpp index 9172d4e0..82e000dc 100644 --- a/test/mpi_runner/gtest_main_mpi.cpp +++ b/test/mpi_runner/gtest_main_mpi.cpp @@ -23,7 +23,7 @@ main(int argc, char** argv) if (provided < required) throw std::runtime_error("MPI does not support required threading level"); #else - MPI_Init(&argc,&argv); + MPI_Init(&argc, &argv); #endif // printf("Running main() from %s\n", __FILE__); diff --git a/test/structured/cubed_sphere/test_cubed_sphere_exchange.cpp 
b/test/structured/cubed_sphere/test_cubed_sphere_exchange.cpp index 3856e3e4..88a38989 100644 --- a/test/structured/cubed_sphere/test_cubed_sphere_exchange.cpp +++ b/test/structured/cubed_sphere/test_cubed_sphere_exchange.cpp @@ -77,30 +77,36 @@ // -------------------------------------------------------+------------------------------------------------------- // helper macro for checks -#define GHEX_CS_CHECK_HEADER \ - const auto x_dom_min = field.offsets()[0]; \ - const auto x_min = x_dom_min-halo; \ - const auto y_dom_min = field.offsets()[1]; \ - const auto y_min = y_dom_min-halo; \ - const auto x_dom_max = x_dom_min + n; \ - const auto x_max = x_dom_max+halo; \ - const auto y_dom_max = y_dom_min + n; \ - const auto y_max = y_dom_max+halo; \ - const auto strides = field.byte_strides(); \ +#define GHEX_CS_CHECK_HEADER \ + const auto x_dom_min = field.offsets()[0]; \ + const auto x_min = x_dom_min - halo; \ + const auto y_dom_min = field.offsets()[1]; \ + const auto y_min = y_dom_min - halo; \ + const auto x_dom_max = x_dom_min + n; \ + const auto x_max = x_dom_max + halo; \ + const auto y_dom_max = y_dom_min + n; \ + const auto y_max = y_dom_max + halo; \ + const auto strides = field.byte_strides(); \ using value_type = typename Field::value_type; // helper macro for checks -#define GHEX_CS_CHECK_VALUE \ - const auto memory_location = strides[3]*c + strides[0]*x + strides[1]*y+ strides[2]*z; \ - const value_type value = *reinterpret_cast( \ - reinterpret_cast(field.data())+memory_location); +#define GHEX_CS_CHECK_VALUE \ + const auto memory_location = \ + strides[3] * c + strides[0] * x + strides[1] * y + strides[2] * z; \ + const value_type value = *reinterpret_cast( \ + reinterpret_cast(field.data()) + memory_location); template -int id_to_int(const Id& id) { - if (id[0]==0 && id[1]==0) return 0; - else if (id[1]==0) return 1; - else if (id[0]==0) return 2; - else return 3; +int +id_to_int(const Id& id) +{ + if (id[0] == 0 && id[1] == 0) return 0; + else if 
(id[1] == 0) + return 1; + else if (id[0] == 0) + return 2; + else + return 3; } // even checks @@ -108,85 +114,94 @@ int id_to_int(const Id& id) { // check received data for even tile and subdomain with id 0 template -void check_even_0(const Field& field, int halo, int n) { +void +check_even_0(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_even_1(const Field& field, int halo, int n) { +void +check_even_1(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_even_2(const Field& field, int halo, int n) { +void +check_even_2(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_even_3(const Field& field, int halo, int n) { +void +check_even_3(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_odd_0(const Field& field, int halo, int n) { +void +check_odd_0(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_odd_1(const Field& field, int halo, int n) { +void +check_odd_1(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_odd_2(const Field& field, int halo, int n) { +void +check_odd_2(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_odd_3(const Field& field, int halo, int n) { +void +check_odd_3(const Field& field, int halo, int n) +{ GHEX_CS_CHECK_HEADER using namespace ghex::structured::cubed_sphere; - for (int c=0; c -void check_field(const Field& field, int halo, int n) { +void +check_field(const Field& field, int halo, int n) +{ const auto id = 
id_to_int(field.domain_id().id); - if (field.domain_id().tile % 2 == 0) { - switch (id) { + if (field.domain_id().tile % 2 == 0) + { + switch (id) + { case 0: check_even_0(field, halo, n); break; @@ -825,8 +907,10 @@ void check_field(const Field& field, int halo, int n) { break; } } - else { - switch (id) { + else + { + switch (id) + { case 0: check_odd_0(field, halo, n); break; @@ -855,62 +939,48 @@ TEST_F(mpi_test_fixture, cubed_sphere) halo_generator halo_gen(2); // cube with size 10 and 6 levels - cube c{10,6}; + cube c{10, 6}; // define 4 local domains - domain_descriptor domain0 (c, ctxt.rank(), 0, 4, 0, 4); - domain_descriptor domain1 (c, ctxt.rank(), 5, 9, 0, 4); - domain_descriptor domain2 (c, ctxt.rank(), 0, 4, 5, 9); - domain_descriptor domain3 (c, ctxt.rank(), 5, 9, 5, 9); - std::vector local_domains{ domain0, domain1, domain2, domain3 }; + domain_descriptor domain0(c, ctxt.rank(), 0, 4, 0, 4); + domain_descriptor domain1(c, ctxt.rank(), 5, 9, 0, 4); + domain_descriptor domain2(c, ctxt.rank(), 0, 4, 5, 9); + domain_descriptor domain3(c, ctxt.rank(), 5, 9, 5, 9); + std::vector local_domains{domain0, domain1, domain2, domain3}; // allocate large enough memory for fields, sufficient for 3 halo lines // use 8 components per field and 6 z-levels - const int halo=3; - ghex::test::util::memory data_dom_0((2*halo+5)*(2*halo+5)*6*8,-1); // fields - ghex::test::util::memory data_dom_1((2*halo+5)*(2*halo+5)*6*8,-1); // fields - ghex::test::util::memory data_dom_2((2*halo+5)*(2*halo+5)*6*8,-1); // fields - ghex::test::util::memory data_dom_3((2*halo+5)*(2*halo+5)*6*8,-1); // fields + const int halo = 3; + ghex::test::util::memory data_dom_0((2 * halo + 5) * (2 * halo + 5) * 6 * 8, + -1); // fields + ghex::test::util::memory data_dom_1((2 * halo + 5) * (2 * halo + 5) * 6 * 8, + -1); // fields + ghex::test::util::memory data_dom_2((2 * halo + 5) * (2 * halo + 5) * 6 * 8, + -1); // fields + ghex::test::util::memory data_dom_3((2 * halo + 5) * (2 * halo + 5) * 6 * 8, + 
-1); // fields // initialize physical domain (leave halos as they are) - for (int comp=0; comp<8; ++comp) - for (int z=0; z<6; ++z) - for (int y=0; y<5; ++y) - for (int x=0; x<5; ++x) - { - const auto idx = - (x+halo) + - (y+halo)*(2*halo+5) + - z*(2*halo+5)*(2*halo+5) + - comp*(2*halo+5)*(2*halo+5)*6; - data_dom_0[idx] = - 100000*(domain0.domain_id().tile+1) + - 10000*id_to_int(domain0.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_1[idx] = - 100000*(domain1.domain_id().tile+1) + - 10000*id_to_int(domain1.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_2[idx] = - 100000*(domain2.domain_id().tile+1) + - 10000*id_to_int(domain2.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_3[idx] = - 100000*(domain3.domain_id().tile+1) + - 10000*id_to_int(domain3.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; + for (int comp = 0; comp < 8; ++comp) + for (int z = 0; z < 6; ++z) + for (int y = 0; y < 5; ++y) + for (int x = 0; x < 5; ++x) + { + const auto idx = (x + halo) + (y + halo) * (2 * halo + 5) + + z * (2 * halo + 5) * (2 * halo + 5) + + comp * (2 * halo + 5) * (2 * halo + 5) * 6; + data_dom_0[idx] = 100000 * (domain0.domain_id().tile + 1) + + 10000 * id_to_int(domain0.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_1[idx] = 100000 * (domain1.domain_id().tile + 1) + + 10000 * id_to_int(domain1.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_2[idx] = 100000 * (domain2.domain_id().tile + 1) + + 10000 * id_to_int(domain2.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_3[idx] = 100000 * (domain3.domain_id().tile + 1) + + 10000 * id_to_int(domain3.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; } #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) @@ -932,30 +1002,14 @@ TEST_F(mpi_test_fixture, cubed_sphere) #endif // wrap field memory in a field_descriptor - field_descriptor field_dom_0( - domain0, - data_ptr_0, - 
std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,6}, - 8); - field_descriptor field_dom_1( - domain1, - data_ptr_1, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,6}, - 8); - field_descriptor field_dom_2( - domain2, - data_ptr_2, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,6}, - 8); - field_descriptor field_dom_3( - domain3, - data_ptr_3, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,6}, - 8); + field_descriptor field_dom_0(domain0, data_ptr_0, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 6}, 8); + field_descriptor field_dom_1(domain1, data_ptr_1, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 6}, 8); + field_descriptor field_dom_2(domain2, data_ptr_2, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 6}, 8); + field_descriptor field_dom_3(domain3, data_ptr_3, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 6}, 8); // create a structured pattern auto pattern1 = ghex::make_pattern(ctxt, halo_gen, local_domains); @@ -965,11 +1019,9 @@ TEST_F(mpi_test_fixture, cubed_sphere) auto co = ghex::make_communication_object(ctxt); // exchange halo data - co.exchange( - pattern1(field_dom_0), - pattern1(field_dom_1), - pattern1(field_dom_2), - pattern1(field_dom_3)).wait(); + co.exchange(pattern1(field_dom_0), pattern1(field_dom_1), pattern1(field_dom_2), + pattern1(field_dom_3)) + .wait(); #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) data_dom_0.clone_to_host(); @@ -1001,62 +1053,48 @@ TEST_F(mpi_test_fixture, cubed_sphere_vector) halo_generator halo_gen(2); // cube with size 10 and 7 levels - cube c{10,7}; + cube c{10, 7}; // define 4 local domains - domain_descriptor domain0 (c, ctxt.rank(), 0, 4, 0, 4); - domain_descriptor domain1 (c, ctxt.rank(), 5, 9, 0, 4); - domain_descriptor domain2 (c, ctxt.rank(), 0, 4, 5, 9); - domain_descriptor domain3 (c, ctxt.rank(), 5, 9, 5, 9); - std::vector local_domains{ domain0, domain1, domain2, domain3 
}; + domain_descriptor domain0(c, ctxt.rank(), 0, 4, 0, 4); + domain_descriptor domain1(c, ctxt.rank(), 5, 9, 0, 4); + domain_descriptor domain2(c, ctxt.rank(), 0, 4, 5, 9); + domain_descriptor domain3(c, ctxt.rank(), 5, 9, 5, 9); + std::vector local_domains{domain0, domain1, domain2, domain3}; // allocate large enough memory for fields, sufficient for 3 halo lines // use 8 components per field and 6 z-levels - const int halo=3; - ghex::test::util::memory data_dom_0((2*halo+5)*(2*halo+5)*3*7,-1); // fields - ghex::test::util::memory data_dom_1((2*halo+5)*(2*halo+5)*3*7,-1); // fields - ghex::test::util::memory data_dom_2((2*halo+5)*(2*halo+5)*3*7,-1); // fields - ghex::test::util::memory data_dom_3((2*halo+5)*(2*halo+5)*3*7,-1); // fields + const int halo = 3; + ghex::test::util::memory data_dom_0((2 * halo + 5) * (2 * halo + 5) * 3 * 7, + -1); // fields + ghex::test::util::memory data_dom_1((2 * halo + 5) * (2 * halo + 5) * 3 * 7, + -1); // fields + ghex::test::util::memory data_dom_2((2 * halo + 5) * (2 * halo + 5) * 3 * 7, + -1); // fields + ghex::test::util::memory data_dom_3((2 * halo + 5) * (2 * halo + 5) * 3 * 7, + -1); // fields // initialize physical domain (leave halos as they are) - for (int comp=0; comp<3; ++comp) - for (int z=0; z<7; ++z) - for (int y=0; y<5; ++y) - for (int x=0; x<5; ++x) - { - const auto idx = - (x+halo) + - (y+halo)*(2*halo+5) + - z*(2*halo+5)*(2*halo+5) + - comp*(2*halo+5)*(2*halo+5)*7; - data_dom_0[idx] = - 100000*(domain0.domain_id().tile+1) + - 10000*id_to_int(domain0.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_1[idx] = - 100000*(domain1.domain_id().tile+1) + - 10000*id_to_int(domain1.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_2[idx] = - 100000*(domain2.domain_id().tile+1) + - 10000*id_to_int(domain2.domain_id().id) + - 1000*comp + - 100*x + - 10*y + - 1*z; - data_dom_3[idx] = - 100000*(domain3.domain_id().tile+1) + - 10000*id_to_int(domain3.domain_id().id) + - 1000*comp + - 
100*x + - 10*y + - 1*z; + for (int comp = 0; comp < 3; ++comp) + for (int z = 0; z < 7; ++z) + for (int y = 0; y < 5; ++y) + for (int x = 0; x < 5; ++x) + { + const auto idx = (x + halo) + (y + halo) * (2 * halo + 5) + + z * (2 * halo + 5) * (2 * halo + 5) + + comp * (2 * halo + 5) * (2 * halo + 5) * 7; + data_dom_0[idx] = 100000 * (domain0.domain_id().tile + 1) + + 10000 * id_to_int(domain0.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_1[idx] = 100000 * (domain1.domain_id().tile + 1) + + 10000 * id_to_int(domain1.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_2[idx] = 100000 * (domain2.domain_id().tile + 1) + + 10000 * id_to_int(domain2.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; + data_dom_3[idx] = 100000 * (domain3.domain_id().tile + 1) + + 10000 * id_to_int(domain3.domain_id().id) + 1000 * comp + + 100 * x + 10 * y + 1 * z; } #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) @@ -1078,30 +1116,18 @@ TEST_F(mpi_test_fixture, cubed_sphere_vector) #endif // wrap field memory in a field_descriptor - field_descriptor field_dom_0( - domain0, - data_ptr_0, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,7}, - 3, true); - field_descriptor field_dom_1( - domain1, - data_ptr_1, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,7}, - 3, true); - field_descriptor field_dom_2( - domain2, - data_ptr_2, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,7}, - 3, true); - field_descriptor field_dom_3( - domain3, - data_ptr_3, - std::array{halo,halo,0}, - std::array{2*halo+5,2*halo+5,7}, - 3, true); + field_descriptor field_dom_0(domain0, data_ptr_0, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 7}, 3, + true); + field_descriptor field_dom_1(domain1, data_ptr_1, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 7}, 3, + true); + field_descriptor field_dom_2(domain2, data_ptr_2, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 
5, 7}, 3, + true); + field_descriptor field_dom_3(domain3, data_ptr_3, + std::array{halo, halo, 0}, std::array{2 * halo + 5, 2 * halo + 5, 7}, 3, + true); // create a structured pattern auto pattern1 = ghex::make_pattern(ctxt, halo_gen, local_domains); @@ -1111,11 +1137,9 @@ TEST_F(mpi_test_fixture, cubed_sphere_vector) auto co = ghex::make_communication_object(ctxt); // exchange halo data - co.exchange( - pattern1(field_dom_0), - pattern1(field_dom_1), - pattern1(field_dom_2), - pattern1(field_dom_3)).wait(); + co.exchange(pattern1(field_dom_0), pattern1(field_dom_1), pattern1(field_dom_2), + pattern1(field_dom_3)) + .wait(); #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) data_dom_0.clone_to_host(); diff --git a/test/structured/regular/test_local_rma.cpp b/test/structured/regular/test_local_rma.cpp index b7a4f27c..c264770d 100644 --- a/test/structured/regular/test_local_rma.cpp +++ b/test/structured/regular/test_local_rma.cpp @@ -84,8 +84,9 @@ struct simulation_1 std::vector local_domains; std::array halos; halo_generator_type halo_gen; - using pattern_type = std::remove_reference_t(ctxt, halo_gen, local_domains))>; + using pattern_type = + std::remove_reference_t(ctxt, halo_gen, + local_domains))>; pattern_type pattern; field_descriptor_type field_1a; field_descriptor_type field_1b; @@ -126,44 +127,38 @@ struct simulation_1 std::array{((ctxt.rank() % 2) * 2 + 1) * local_ext[0] - 1, (ctxt.rank() / 2 + 1) * local_ext[1] - 1, local_ext[2] - 1}}, domain_descriptor_type{ctxt.rank() * 2 + 1, - std::array{ - ((ctxt.rank() % 2) * 2 + 1) * local_ext[0], (ctxt.rank() / 2) * local_ext[1], 0}, + std::array{((ctxt.rank() % 2) * 2 + 1) * local_ext[0], + (ctxt.rank() / 2) * local_ext[1], 0}, std::array{((ctxt.rank() % 2) * 2 + 2) * local_ext[0] - 1, (ctxt.rank() / 2 + 1) * local_ext[1] - 1, local_ext[2] - 1}}} , halos{2, 2, 2, 2, 2, 2} , halo_gen(g_first, g_last, halos, periodic) , pattern{ghex::make_pattern(ctxt, halo_gen, local_domains)} - , 
field_1a{ghex::wrap_field>( - local_domains[0], field_1a_raw.data(), offset, local_ext_buffer)} - , field_1b{ghex::wrap_field>( - local_domains[1], field_1b_raw.data(), offset, local_ext_buffer)} - , field_2a{ghex::wrap_field>( - local_domains[0], field_2a_raw.data(), offset, local_ext_buffer)} - , field_2b{ghex::wrap_field>( - local_domains[1], field_2b_raw.data(), offset, local_ext_buffer)} - , field_3a{ghex::wrap_field>( - local_domains[0], field_3a_raw.data(), offset, local_ext_buffer)} - , field_3b - { - ghex::wrap_field>( - local_domains[1], field_3b_raw.data(), offset, local_ext_buffer) - } + , field_1a{ghex::wrap_field>(local_domains[0], + field_1a_raw.data(), offset, local_ext_buffer)} + , field_1b{ghex::wrap_field>(local_domains[1], + field_1b_raw.data(), offset, local_ext_buffer)} + , field_2a{ghex::wrap_field>(local_domains[0], + field_2a_raw.data(), offset, local_ext_buffer)} + , field_2b{ghex::wrap_field>(local_domains[1], + field_2b_raw.data(), offset, local_ext_buffer)} + , field_3a{ghex::wrap_field>(local_domains[0], + field_3a_raw.data(), offset, local_ext_buffer)} + , field_3b{ghex::wrap_field>(local_domains[1], + field_3b_raw.data(), offset, local_ext_buffer)} #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) - , field_1a_gpu{ghex::wrap_field>( - local_domains[0], field_1a_raw.device_data(), offset, local_ext_buffer)}, - field_1b_gpu{ghex::wrap_field>( - local_domains[1], field_1b_raw.device_data(), offset, local_ext_buffer)}, - field_2a_gpu{ghex::wrap_field>( - local_domains[0], field_2a_raw.device_data(), offset, local_ext_buffer)}, - field_2b_gpu{ghex::wrap_field>( - local_domains[1], field_2b_raw.device_data(), offset, local_ext_buffer)}, - field_3a_gpu{ghex::wrap_field>( - local_domains[0], field_3a_raw.device_data(), offset, local_ext_buffer)}, - field_3b_gpu - { - ghex::wrap_field>( - local_domains[1], field_3b_raw.device_data(), offset, local_ext_buffer) - } + , field_1a_gpu{ghex::wrap_field>(local_domains[0], + 
field_1a_raw.device_data(), offset, local_ext_buffer)} + , field_1b_gpu{ghex::wrap_field>(local_domains[1], + field_1b_raw.device_data(), offset, local_ext_buffer)} + , field_2a_gpu{ghex::wrap_field>(local_domains[0], + field_2a_raw.device_data(), offset, local_ext_buffer)} + , field_2b_gpu{ghex::wrap_field>(local_domains[1], + field_2b_raw.device_data(), offset, local_ext_buffer)} + , field_3a_gpu{ghex::wrap_field>(local_domains[0], + field_3a_raw.device_data(), offset, local_ext_buffer)} + , field_3b_gpu{ghex::wrap_field>(local_domains[1], + field_3b_raw.device_data(), offset, local_ext_buffer)} #endif , mt{multithread} { @@ -296,7 +291,9 @@ struct simulation_1 { int zl = 0; for (int z = d.first()[2]; z <= d.last()[2]; ++z, ++zl) - { f(xl, yl, zl) = array_type{(T)x, (T)y, (T)z}; } + { + f(xl, yl, zl) = array_type{(T)x, (T)y, (T)z}; + } } } } @@ -320,7 +317,9 @@ struct simulation_1 hxl = 0; } if (i == 1 && size == 1) //comm.rank()%2 == 0 && comm.rank()+1 == comm.size()) - { hxr = 0; } + { + hxr = 0; + } // hack end for (int x = d.first()[0] - hxl; x <= d.last()[0] + hxr; ++x, ++xl) { diff --git a/test/structured/regular/test_regular_domain.cpp b/test/structured/regular/test_regular_domain.cpp index 1f9f0160..0137b88d 100644 --- a/test/structured/regular/test_regular_domain.cpp +++ b/test/structured/regular/test_regular_domain.cpp @@ -89,8 +89,8 @@ struct parameters static field_type wrap(ghex::test::util::memory& f, domain_descriptor_type& d, Offsets const& o, Extents const& ext, ghex::cpu) { - return ghex::wrap_field>( - d, f.host_data(), o, ext); + return ghex::wrap_field>(d, f.host_data(), o, + ext); } #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) @@ -98,8 +98,8 @@ struct parameters static field_type wrap(ghex::test::util::memory& f, domain_descriptor_type& d, Offsets const& o, Extents const& ext, ghex::gpu) { - return ghex::wrap_field>( - d, f.device_data(), o, ext); + return ghex::wrap_field>(d, f.device_data(), o, + ext); } #endif @@ -122,8 
+122,8 @@ struct parameters // local domains std::vector local_domains; // pattern containers - using pattern_container_type = decltype(ghex::make_pattern( - ctxt, std::declval(), local_domains)); + using pattern_container_type = decltype(ghex::make_pattern(ctxt, + std::declval(), local_domains)); std::unique_ptr pattern1; std::unique_ptr pattern2; @@ -162,13 +162,13 @@ struct parameters } template - void fill_values( - ghex::test::util::memory>& m, domain_descriptor_type const& d, ghex::cpu); + void fill_values(ghex::test::util::memory>& m, domain_descriptor_type const& d, + ghex::cpu); #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) template - void fill_values( - ghex::test::util::memory>& m, domain_descriptor_type const& d, ghex::gpu) + void fill_values(ghex::test::util::memory>& m, domain_descriptor_type const& d, + ghex::gpu) { fill_values(m, d, ghex::cpu{}); m.clone_to_device(); @@ -237,7 +237,8 @@ struct test_exchange static void run_mt(ghex::context& ctxt) { params_type params(ctxt); - auto func = [&ctxt](auto... bis) { + auto func = [&ctxt](auto... bis) + { auto co = ghex::make_communication_object(ctxt); co.exchange(bis...).wait(); }; @@ -253,7 +254,8 @@ struct test_exchange static void run_mt_async(ghex::context& ctxt) { params_type params(ctxt); - auto func = [&ctxt](auto... bis) { + auto func = [&ctxt](auto... bis) + { auto co = ghex::make_communication_object(ctxt); co.exchange(bis...).wait(); }; @@ -280,12 +282,12 @@ struct test_exchange auto func = [&ctxt](auto co, auto... 
bis) { return co->exchange(bis...); }; auto co1 = ghex::make_communication_object(ctxt); auto co2 = ghex::make_communication_object(ctxt); - auto future1 = std::async( - policy, func, &co1, params.field_1a.bi, params.field_2a.bi, params.field_3a.bi); - auto future2 = std::async( - policy, func, &co2, params.field_1b.bi, params.field_2b.bi, params.field_3b.bi); - auto h1 = future1.get(); - auto h2 = future2.get(); + auto future1 = std::async(policy, func, &co1, params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi); + auto future2 = std::async(policy, func, &co2, params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi); + auto h1 = future1.get(); + auto h2 = future2.get(); while (!h1.is_ready() || !h2.is_ready()) { h1.progress(); @@ -308,10 +310,10 @@ struct test_exchange_vector { params_type params(ctxt); auto co = ghex::make_communication_object(ctxt); - std::vector fields1{ - params.field_1a.bi, params.field_2a.bi, params.field_3a.bi}; - std::vector fields2{ - params.field_1b.bi, params.field_2b.bi, params.field_3b.bi}; + std::vector fields1{params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi}; + std::vector fields2{params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi}; co.exchange(fields1.begin(), fields1.end(), fields2.begin(), fields2.end()).wait(); params.check_values(); } @@ -321,12 +323,12 @@ struct test_exchange_vector params_type params(ctxt); auto co1 = ghex::make_communication_object(ctxt); auto co2 = ghex::make_communication_object(ctxt); - std::vector fields1{ - params.field_1a.bi, params.field_2a.bi, params.field_3a.bi}; - std::vector fields2{ - params.field_1b.bi, params.field_2b.bi, params.field_3b.bi}; - auto h1 = co1.exchange(fields1.begin(), fields1.end()); - auto h2 = co2.exchange(fields2.begin(), fields2.end()); + std::vector fields1{params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi}; + std::vector fields2{params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi}; + auto h1 = 
co1.exchange(fields1.begin(), fields1.end()); + auto h2 = co2.exchange(fields2.begin(), fields2.end()); while (!h1.is_ready() || !h2.is_ready()) { h1.progress(); @@ -338,15 +340,16 @@ struct test_exchange_vector static void run_mt(ghex::context& ctxt) { params_type params(ctxt); - auto func = [&ctxt](auto vec) { + auto func = [&ctxt](auto vec) + { auto co = ghex::make_communication_object(ctxt); co.exchange(vec.begin(), vec.end()).wait(); }; - std::vector fields1{ - params.field_1a.bi, params.field_2a.bi, params.field_3a.bi}; - std::vector fields2{ - params.field_1b.bi, params.field_2b.bi, params.field_3b.bi}; - std::vector threads; + std::vector fields1{params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi}; + std::vector fields2{params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi}; + std::vector threads; threads.push_back(std::thread{func, fields1}); threads.push_back(std::thread{func, fields2}); for (auto& t : threads) t.join(); @@ -356,15 +359,16 @@ struct test_exchange_vector static void run_mt_async(ghex::context& ctxt) { params_type params(ctxt); - auto func = [&ctxt](auto vec) { + auto func = [&ctxt](auto vec) + { auto co = ghex::make_communication_object(ctxt); co.exchange(vec.begin(), vec.end()).wait(); }; - std::vector fields1{ - params.field_1a.bi, params.field_2a.bi, params.field_3a.bi}; - std::vector fields2{ - params.field_1b.bi, params.field_2b.bi, params.field_3b.bi}; - auto policy = std::launch::async; + std::vector fields1{params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi}; + std::vector fields2{params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi}; + auto policy = std::launch::async; // note: deferred launch policy does not work since it will deadlock in the func auto future1 = std::async(policy, func, fields1); auto future2 = std::async(policy, func, fields2); @@ -385,14 +389,14 @@ struct test_exchange_vector auto func = [&ctxt](auto co, auto vec) { return co->exchange(vec.begin(), vec.end()); }; auto co1 = 
ghex::make_communication_object(ctxt); auto co2 = ghex::make_communication_object(ctxt); - std::vector fields1{ - params.field_1a.bi, params.field_2a.bi, params.field_3a.bi}; - std::vector fields2{ - params.field_1b.bi, params.field_2b.bi, params.field_3b.bi}; - auto future1 = std::async(policy, func, &co1, fields1); - auto future2 = std::async(policy, func, &co2, fields2); - auto h1 = future1.get(); - auto h2 = future2.get(); + std::vector fields1{params.field_1a.bi, params.field_2a.bi, + params.field_3a.bi}; + std::vector fields2{params.field_1b.bi, params.field_2b.bi, + params.field_3b.bi}; + auto future1 = std::async(policy, func, &co1, fields1); + auto future2 = std::async(policy, func, &co2, fields2); + auto h1 = future1.get(); + auto h2 = future2.get(); while (!h1.is_ready() || !h2.is_ready()) { h1.progress(); @@ -606,16 +610,16 @@ parameters::parameters(ghex::context& c) template template void -parameters::fill_values( - ghex::test::util::memory>& m, domain_descriptor_type const& d, ghex::cpu) +parameters::fill_values(ghex::test::util::memory>& m, + domain_descriptor_type const& d, ghex::cpu) { for (int z = 0; z < local_ext[2]; ++z) for (int y = 0; y < local_ext[1]; ++y) for (int x = 0; x < local_ext[0]; ++x) m[(x + offset[0]) + local_ext_buffer[0] * ((y + offset[1]) + local_ext_buffer[1] * (z + offset[2]))] = - array_type{ - (T)(x + d.first()[0]), (T)(y + d.first()[1]), (T)(z + d.first()[2])}; + array_type{(T)(x + d.first()[0]), (T)(y + d.first()[1]), + (T)(z + d.first()[2])}; } template diff --git a/test/structured/regular/test_simple_regular_domain.cpp b/test/structured/regular/test_simple_regular_domain.cpp index cd1ab5b9..ff798051 100644 --- a/test/structured/regular/test_simple_regular_domain.cpp +++ b/test/structured/regular/test_simple_regular_domain.cpp @@ -53,8 +53,8 @@ template auto wrap_cpu_field(RawField& raw_field, const domain& d) { - return wrap_field>( - d, raw_field.data(), arr{HALO, HALO}, arr{HALO * 2 + DIM, HALO * 2 + DIM / 2}); + 
return wrap_field>(d, raw_field.data(), arr{HALO, HALO}, + arr{HALO * 2 + DIM, HALO * 2 + DIM / 2}); } #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) @@ -62,8 +62,8 @@ template auto wrap_gpu_field(RawField& raw_field, const domain& d) { - return wrap_field>( - d, raw_field.device_data(), arr{HALO, HALO}, arr{HALO * 2 + DIM, HALO * 2 + DIM / 2}); + return wrap_field>(d, raw_field.device_data(), + arr{HALO, HALO}, arr{HALO * 2 + DIM, HALO * 2 + DIM / 2}); } #endif @@ -128,8 +128,8 @@ check(const Field& field, const arr& dims) expected(j, dims[1], field.domain().first()[1], field.domain().last()[1], periodic[1]); for (int i = -HALO; i < DIM + HALO; ++i) { - const auto x = expected( - i, dims[0], field.domain().first()[0], field.domain().last()[0], periodic[0]); + const auto x = expected(i, dims[0], field.domain().first()[0], field.domain().last()[0], + periodic[0]); res = res && compare(field({i, j}), x, y); } } @@ -481,13 +481,13 @@ sim(bool multi_threaded) coords[1] = ctxt.rank() / dims[0]; coords[0] = ctxt.rank() - coords[1] * dims[0]; // make 2 domains per rank - std::vector domains{ - make_domain(ctxt.rank(), 0, coords), make_domain(ctxt.rank(), 1, coords)}; + std::vector domains{make_domain(ctxt.rank(), 0, coords), + make_domain(ctxt.rank(), 1, coords)}; // neighbor lookup domain_lu d_lu{dims}; - auto staged_pattern = structured::regular::make_staged_pattern( - ctxt, domains, d_lu, arr{0, 0}, arr{dims[0] * DIM - 1, dims[1] * DIM - 1}, halos, periodic); + auto staged_pattern = structured::regular::make_staged_pattern(ctxt, domains, d_lu, arr{0, 0}, + arr{dims[0] * DIM - 1, dims[1] * DIM - 1}, halos, periodic); // make halo generator halo_gen gen{arr{0, 0}, arr{dims[0] * DIM - 1, dims[1] * DIM - 1}, halos, periodic}; @@ -497,18 +497,14 @@ sim(bool multi_threaded) bool res = true; if (multi_threaded) { - auto run_fct = [&ctxt, &pattern, &staged_pattern, &domains, &dims](int id) { - return run(ctxt, pattern, staged_pattern, domains, dims, id); - }; + 
auto run_fct = [&ctxt, &pattern, &staged_pattern, &domains, &dims](int id) + { return run(ctxt, pattern, staged_pattern, domains, dims, id); }; auto f1 = std::async(std::launch::async, run_fct, 0); auto f2 = std::async(std::launch::async, run_fct, 1); res = res && f1.get(); res = res && f2.get(); } - else - { - res = res && run(ctxt, pattern, staged_pattern, domains, dims); - } + else { res = res && run(ctxt, pattern, staged_pattern, domains, dims); } // reduce res bool all_res = false; MPI_Reduce(&res, &all_res, 1, MPI_C_BOOL, MPI_LAND, 0, MPI_COMM_WORLD); diff --git a/test/unstructured/test_user_concepts.cpp b/test/unstructured/test_user_concepts.cpp index 35e4d0a3..1fb6e02a 100644 --- a/test/unstructured/test_user_concepts.cpp +++ b/test/unstructured/test_user_concepts.cpp @@ -271,8 +271,8 @@ test_data_descriptor(ghex::context& ctxt, std::size_t levels, bool levels_first) auto co = ghex::make_communication_object(ctxt); // application data - auto& d = local_domains[0]; - ghex::test::util::memory field(d.size()*levels, 0); + auto& d = local_domains[0]; + ghex::test::util::memory field(d.size() * levels, 0); initialize_data(d, field, levels, levels_first); data_descriptor_cpu_int_type data{d, field, levels, levels_first}; diff --git a/test/unstructured/unstructured_test_case.hpp b/test/unstructured/unstructured_test_case.hpp index 2f6a55ec..6ac0ad2c 100644 --- a/test/unstructured/unstructured_test_case.hpp +++ b/test/unstructured/unstructured_test_case.hpp @@ -344,27 +344,29 @@ check_recv_halos_indices(const pattern_type& p) template void -initialize_data(const domain_descriptor_type& d, Container& field, std::size_t levels = 1u, bool levels_first = true) +initialize_data(const domain_descriptor_type& d, Container& field, std::size_t levels = 1u, + bool levels_first = true) { assert(field.size() == d.size() * levels); if (levels_first) for (const auto& x : d.inner_ids()) for (std::size_t level = 0u; level < levels; ++level) - field[x.second * levels + level] = 
d.domain_id() * 10000 + x.first*100 + level; + field[x.second * levels + level] = d.domain_id() * 10000 + x.first * 100 + level; else for (std::size_t level = 0u; level < levels; ++level) for (const auto& x : d.inner_ids()) - field[x.second + level*d.size()] = d.domain_id() * 10000 + x.first*100 + level; + field[x.second + level * d.size()] = d.domain_id() * 10000 + x.first * 100 + level; } template void -check_exchanged_data(const domain_descriptor_type& d, const Container& field, const pattern_type& p, std::size_t levels = 1u, bool levels_first = true) +check_exchanged_data(const domain_descriptor_type& d, const Container& field, const pattern_type& p, + std::size_t levels = 1u, bool levels_first = true) { using value_type = typename Container::value_type; using index_type = pattern_type::index_type; std::map halo_map{}; - for (const auto& [edid, c]: p.recv_halos()) + for (const auto& [edid, c] : p.recv_halos()) { for (const auto idx : c.front().local_indices()) { @@ -374,11 +376,15 @@ check_exchanged_data(const domain_descriptor_type& d, const Container& field, co if (levels_first) for (auto [idx, did] : halo_map) for (std::size_t level = 0u; level < levels; ++level) - EXPECT_EQ(field[idx * levels + level], static_cast(did * 10000 + d.global_index(idx).value()*100 + level)); + EXPECT_EQ(field[idx * levels + level], + static_cast( + did * 10000 + d.global_index(idx).value() * 100 + level)); else for (std::size_t level = 0u; level < levels; ++level) for (auto [idx, did] : halo_map) - EXPECT_EQ(field[idx + level * d.size()], static_cast(did * 10000 + d.global_index(idx).value()*100 + level)); + EXPECT_EQ(field[idx + level * d.size()], + static_cast( + did * 10000 + d.global_index(idx).value() * 100 + level)); } /** @brief Helper functor type, used as default template argument below*/ diff --git a/test/util/memory.hpp b/test/util/memory.hpp index 7e90ca5d..24f00084 100644 --- a/test/util/memory.hpp +++ b/test/util/memory.hpp @@ -41,10 +41,7 @@ struct memory 
memory(unsigned int size_, const T& value = T{}, bool /*no_device_delete*/ = false) #endif : m_size{size_} - , m_host_memory - { - new T[m_size] - } + , m_host_memory{new T[m_size]} #if defined(GHEX_USE_GPU) || defined(GHEX_GPU_MODE_EMULATE) , m_device_memory((T*)hwmalloc::device_malloc(sizeof(T) * m_size), deleter{no_device_delete}) #endif