From f2bb5be241077ec66d7cb5047b29d06543b6fff6 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:46:53 +0200 Subject: [PATCH 01/15] const and replace outdated todo by comment --- src/parallel/GeneralDomainDecomposition.h | 2 +- src/particleContainer/TraversalTuner.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.h b/src/parallel/GeneralDomainDecomposition.h index 7adbde1a7..f1900e720 100644 --- a/src/parallel/GeneralDomainDecomposition.h +++ b/src/parallel/GeneralDomainDecomposition.h @@ -172,7 +172,7 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { std::pair, std::array> latchToGridSize(std::array boxMin, std::array boxMax) { for (size_t ind = 0; ind < 3; ++ind) { - double currentGridSize = (*_gridSize)[ind]; + const double currentGridSize = (*_gridSize)[ind]; // For boxmin, the lower domain boundary is 0, so that's always fine! boxMin[ind] = std::round(boxMin[ind] / currentGridSize) * currentGridSize; // update boxmax only if it isn't at the very top of the domain! diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index b4f0f27a6..5c1b66368 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -132,7 +132,8 @@ TraversalTuner::~TraversalTuner() { template void TraversalTuner::findOptimalTraversal() { - // TODO implement autotuning here! At the moment the traversal is chosen via readXML! + // ls1 always uses the traversal selected via the XML + // If you want auto tuning activate AutoPas via CMake _optimalTraversal = _traversals[selectedTraversal].first; From c8b30341ecee68c0c5fe8e28cded6c8cf9330818 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:49:26 +0200 Subject: [PATCH 02/15] Change log level from info to debug for low level information --- .../OriginalCellPairTraversal.h | 2 +- src/particleContainer/LinkedCells.cpp | 8 ++++---- src/particleContainer/TraversalTuner.h | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h b/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h index c54e9045f..16324a213 100644 --- a/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h +++ b/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h @@ -123,7 +123,7 @@ inline void OriginalCellPairTraversal::computeNeighbourOffsets() { mardyn_assert(forwardNeighbourIndex == 13); mardyn_assert(backwardNeighbourIndex == 13); - Log::global_log->info() << "Neighbour offsets are bounded by " + Log::global_log->debug() << "Neighbour offsets are bounded by " << minNeighbourOffset << ", " << maxNeighbourOffset << std::endl; } diff --git a/src/particleContainer/LinkedCells.cpp b/src/particleContainer/LinkedCells.cpp index d214872be..bb03396c2 100644 --- a/src/particleContainer/LinkedCells.cpp +++ b/src/particleContainer/LinkedCells.cpp @@ -131,7 +131,7 @@ void LinkedCells::readXML(XMLfileUnits& xmlconfig) { } bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { - Log::global_log->info() << "REBUILD OF LinkedCells" << std::endl; + Log::global_log->debug() << "REBUILD OF LinkedCells" << std::endl; for (int i = 0; i < 3; i++) { this->_boundingBoxMin[i] = bBoxMin[i]; @@ -139,13 +139,13 @@ bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { // _haloWidthInNumCells[i] = ::ceil(_cellsInCutoff); _haloWidthInNumCells[i] = _cellsInCutoff; } - 
Log::global_log->info() << "Bounding box: " << "[" << bBoxMin[0] << ", " << bBoxMax[0] << "]" << " x " << "[" + Log::global_log->debug() << "Bounding box: " << "[" << bBoxMin[0] << ", " << bBoxMax[0] << "]" << " x " << "[" << bBoxMin[1] << ", " << bBoxMax[1] << "]" << " x " << "[" << bBoxMin[2] << ", " << bBoxMax[2] << "]" << std::endl; int numberOfCells = 1; - Log::global_log->info() << "Using " << _cellsInCutoff << " cells in cutoff." << std::endl; + Log::global_log->debug() << "Using " << _cellsInCutoff << " cells in cutoff." << std::endl; float rc = (_cutoffRadius / _cellsInCutoff); for (int dim = 0; dim < 3; dim++) { @@ -171,7 +171,7 @@ bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { _haloBoundingBoxMax[dim] = _boundingBoxMax[dim] + _haloLength[dim]; } - Log::global_log->info() << "Cells per dimension (incl. halo): " << _cellsPerDimension[0] << " x " + Log::global_log->debug() << "Cells per dimension (incl. halo): " << _cellsPerDimension[0] << " x " << _cellsPerDimension[1] << " x " << _cellsPerDimension[2] << std::endl; diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index 5c1b66368..85578bc52 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -139,27 +139,27 @@ void TraversalTuner::findOptimalTraversal() { // log traversal if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using HalfShellTraversal." << std::endl; + Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using OriginalCellPairTraversal." << std::endl; + Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C08CellPairTraversal without eighthShell." << std::endl; + Log::global_log->debug() << "Using C08CellPairTraversal without eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C08CellPairTraversal with eighthShell." << std::endl; + Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C04CellPairTraversal." << std::endl; + Log::global_log->debug() << "Using C04CellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using MidpointTraversal." << std::endl; + Log::global_log->debug() << "Using MidpointTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using NeutralTerritoryTraversal." << std::endl; + Log::global_log->debug() << "Using NeutralTerritoryTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) { - Log::global_log->info() << "Using QuickschedTraversal." << std::endl; + Log::global_log->debug() << "Using QuickschedTraversal." << std::endl; #ifndef QUICKSCHED Log::global_log->error() << "MarDyn was compiled without Quicksched Support. Aborting!" << std::endl; mardyn_exit(1); #endif } else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using SlicedCellPairTraversal." << std::endl; + Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; else Log::global_log->warning() << "Using unknown traversal." 
<< std::endl; From 84dbe7afdfae5ec88fddce61e96b1fdda80298f2 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:53:39 +0200 Subject: [PATCH 03/15] Reorder cases based on likelihood to avoid unnecessary dynamic casts --- src/particleContainer/TraversalTuner.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index 85578bc52..65b072512 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -138,16 +138,18 @@ void TraversalTuner::findOptimalTraversal() { _optimalTraversal = _traversals[selectedTraversal].first; // log traversal - if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; - else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; + if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using C08CellPairTraversal without eighthShell." << std::endl; - else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using C04CellPairTraversal." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using MidpointTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) @@ -158,9 +160,7 @@ void TraversalTuner::findOptimalTraversal() { Log::global_log->error() << "MarDyn was compiled without Quicksched Support. Aborting!" << std::endl; mardyn_exit(1); #endif - } else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; - else + } else Log::global_log->warning() << "Using unknown traversal." 
<< std::endl; if (_cellsInCutoff > _optimalTraversal->maxCellsInCutoff()) { From 65a0ce179346df9188e8733c9266d18bd08c5535 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 14:08:42 +0200 Subject: [PATCH 04/15] fix incomplete output --- src/parallel/NeighbourCommunicationScheme.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighbourCommunicationScheme.cpp b/src/parallel/NeighbourCommunicationScheme.cpp index 7eaa8e94e..cef75c7ff 100644 --- a/src/parallel/NeighbourCommunicationScheme.cpp +++ b/src/parallel/NeighbourCommunicationScheme.cpp @@ -525,9 +525,10 @@ void IndirectNeighbourCommunicationScheme::initExchangeMoleculesMPI1D(ParticleCo const int numNeighbours = (*_neighbours)[d].size(); std::vector dummy; for (int i = 0; i < numNeighbours; ++i) { - Log::global_log->debug() << "Rank " << domainDecomp->getRank() << " is initiating communication to" << std::endl; + Log::global_log->debug() << "Rank " << domainDecomp->getRank() + << " is initiating communication to " << (*_neighbours)[d][i].getRank() << "\n"; (*_neighbours)[d][i].initSend(moleculeContainer, domainDecomp->getCommunicator(), - domainDecomp->getMPIParticleType(), msgType, dummy, false, true/*do halo position change*/); + domainDecomp->getMPIParticleType(), msgType, dummy, false, true/*do halo position check*/); } } From 0fa3e008975911572c39aafd1188af38e63725f3 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:20:22 +0200 Subject: [PATCH 05/15] Refactor getCoversWholeDomain to return const reference --- src/parallel/ALLLoadBalancer.h | 2 +- src/parallel/GeneralDomainDecomposition.cpp | 2 +- src/parallel/LoadBalancer.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parallel/ALLLoadBalancer.h b/src/parallel/ALLLoadBalancer.h index 9284dcfcf..c9f0eb996 100644 --- a/src/parallel/ALLLoadBalancer.h +++ b/src/parallel/ALLLoadBalancer.h @@ -21,7 +21,7 @@ class ALLLoadBalancer : public LoadBalancer { // nothing yet. } - std::array getCoversWholeDomain() override { return _coversWholeDomain; } + const std::array& getCoversWholeDomain() const override { return _coversWholeDomain; } private: ALL _all; diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index fbae56f0e..daf73bb47 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -250,7 +250,7 @@ void GeneralDomainDecomposition::migrateParticles(Domain* domain, ParticleContai void GeneralDomainDecomposition::initCommPartners(ParticleContainer* moleculeContainer, Domain* domain) { // init communication partners - auto coversWholeDomain = _loadBalancer->getCoversWholeDomain(); + const auto coversWholeDomain = _loadBalancer->getCoversWholeDomain(); for (int d = 0; d < DIMgeom; ++d) { // this needs to be updated for proper initialization of the neighbours _neighbourCommunicationScheme->setCoverWholeDomain(d, coversWholeDomain[d]); diff --git a/src/parallel/LoadBalancer.h b/src/parallel/LoadBalancer.h index 2b11b0249..f6b4e2faf 100644 --- a/src/parallel/LoadBalancer.h +++ b/src/parallel/LoadBalancer.h @@ -40,5 +40,5 @@ class LoadBalancer { * Indicates if the current process / MPI rank spans the full length of a dimension. * @return Array of bools, for each dimension one value: true, iff the process spans the entire domain along this dimension. 
*/ - virtual std::array getCoversWholeDomain() = 0; + virtual const std::array& getCoversWholeDomain() const = 0; }; From 0cfabdd3ab09060fd4b434352a7a178126a529b9 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:30:58 +0200 Subject: [PATCH 06/15] rename GeneralDomainDecomposition::gridSize -> _latchGridSize for clarity --- src/parallel/GeneralDomainDecomposition.cpp | 16 ++++++++-------- src/parallel/GeneralDomainDecomposition.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index daf73bb47..549633177 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -36,22 +36,22 @@ void GeneralDomainDecomposition::initializeALL() { Log::global_log->info() << "gridSize:" << gridSize[0] << ", " << gridSize[1] << ", " << gridSize[2] << std::endl; Log::global_log->info() << "gridCoords:" << gridCoords[0] << ", " << gridCoords[1] << ", " << gridCoords[2] << std::endl; std::tie(_boxMin, _boxMax) = initializeRegularGrid(_domainLength, gridSize, gridCoords); - if (_forceLatchingToLinkedCellsGrid and not _gridSize.has_value()) { + if (_forceLatchingToLinkedCellsGrid and not _latchGridSize.has_value()) { std::array forcedGridSize{}; for(size_t dim = 0; dim < 3; ++dim){ size_t numCells = _domainLength[dim] / _interactionLength; forcedGridSize[dim] = _domainLength[dim] / numCells; } - _gridSize = forcedGridSize; + _latchGridSize = forcedGridSize; } - if (_gridSize.has_value()) { + if (_latchGridSize.has_value()) { std::tie(_boxMin, _boxMax) = latchToGridSize(_boxMin, _boxMax); } #ifdef ENABLE_ALLLBL // Increased slightly to prevent rounding errors. const double safetyFactor = 1. + 1.e-10; const std::array minimalDomainSize = - _gridSize.has_value() ? *_gridSize + _latchGridSize.has_value() ? *_latchGridSize : std::array{_interactionLength * safetyFactor, _interactionLength * safetyFactor, _interactionLength * safetyFactor}; @@ -102,7 +102,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo Log::global_log->debug() << "work:" << lastTraversalTime << std::endl; Log::global_log->set_mpi_output_root(0); auto [newBoxMin, newBoxMax] = _loadBalancer->rebalance(lastTraversalTime); - if (_gridSize.has_value()) { + if (_latchGridSize.has_value()) { std::tie(newBoxMin, newBoxMax) = latchToGridSize(newBoxMin, newBoxMax); } // migrate the particles, this will rebuild the moleculeContainer! @@ -292,12 +292,12 @@ void GeneralDomainDecomposition::readXML(XMLfileUnits& xmlconfig) { << strings.size() << "!" 
<< std::endl; mardyn_exit(8134); } - _gridSize = {std::stod(strings[0]), std::stod(strings[1]), std::stod(strings[2])}; + _latchGridSize = {std::stod(strings[0]), std::stod(strings[1]), std::stod(strings[2])}; } else { double gridSize = std::stod(gridSizeString); - _gridSize = {gridSize, gridSize, gridSize}; + _latchGridSize = {gridSize, gridSize, gridSize}; } - for (auto gridSize : *_gridSize) { + for (auto gridSize : *_latchGridSize) { if (gridSize < _interactionLength) { Log::global_log->error() << "GeneralDomainDecomposition's gridSize (" << gridSize << ") is smaller than the interactionLength (" << _interactionLength diff --git a/src/parallel/GeneralDomainDecomposition.h b/src/parallel/GeneralDomainDecomposition.h index f1900e720..12dd37f02 100644 --- a/src/parallel/GeneralDomainDecomposition.h +++ b/src/parallel/GeneralDomainDecomposition.h @@ -172,7 +172,7 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { std::pair, std::array> latchToGridSize(std::array boxMin, std::array boxMax) { for (size_t ind = 0; ind < 3; ++ind) { - const double currentGridSize = (*_gridSize)[ind]; + const double currentGridSize = (*_latchGridSize)[ind]; // For boxmin, the lower domain boundary is 0, so that's always fine! boxMin[ind] = std::round(boxMin[ind] / currentGridSize) * currentGridSize; // update boxmax only if it isn't at the very top of the domain! @@ -197,10 +197,10 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { size_t _initFrequency{500}; /** - * Optionally safe a given grid size on which the process boundaries are bound/latched. + * Optionally, give a grid size (=3D size of one grid cell) on which the process boundaries are bound/latched. * If no value is given, it is not used. */ - std::optional> _gridSize{}; + std::optional> _latchGridSize{}; /** * Bool that indicates whether a grid should be forced even if no gridSize is set. From 39d51ba51385a5be3f8a7a7bb594994e2cd12577 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:43:12 +0200 Subject: [PATCH 07/15] Refactor ALLLoadBalancer constructor to use const references Updated ALLLoadBalancer constructor parameters to use `const` references for efficiency and alignment with expected input types for the ALL library. This change improves code clarity and ensures proper handling of input arguments by maintaining consistency in data types. 
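For illustration only: a minimal sketch of the pattern this commit describes, assuming a hypothetical legacySetVertices() stand-in rather than the real ALL API. Read-only inputs are taken by const reference, and mutable local copies are created only where a legacy interface requires non-const data.

#include <array>
#include <vector>

// Hypothetical stand-in for a third-party call that expects non-const double pointers.
inline void legacySetVertices(std::vector<double*>& /*vertices*/) {}

class ExampleBalancer {
public:
	ExampleBalancer(const std::array<double, 3>& boxMin, const std::array<double, 3>& boxMax)
		: _lowerCorner(boxMin), _upperCorner(boxMax) {
		// The const-reference parameters stay untouched; only the member copies are
		// handed to the interface that needs mutable access.
		std::vector<double*> vertices{_lowerCorner.data(), _upperCorner.data()};
		legacySetVertices(vertices);
	}

private:
	std::array<double, 3> _lowerCorner;
	std::array<double, 3> _upperCorner;
};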
--- src/parallel/ALLLoadBalancer.cpp | 30 ++++++++++++++++-------------- src/parallel/ALLLoadBalancer.h | 6 +++--- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/parallel/ALLLoadBalancer.cpp b/src/parallel/ALLLoadBalancer.cpp index d6ff1c95e..b58965216 100644 --- a/src/parallel/ALLLoadBalancer.cpp +++ b/src/parallel/ALLLoadBalancer.cpp @@ -5,24 +5,26 @@ */ #include "ALLLoadBalancer.h" -ALLLoadBalancer::ALLLoadBalancer(std::array boxMin, std::array boxMax, double gamma, - MPI_Comm comm, std::array globalSize, - std::array localCoordinates, std::array minimalPartitionSize) - : _all(3 /*dim*/, gamma) { - std::vector points; - points.emplace_back(3, boxMin.data()); - points.emplace_back(3, boxMax.data()); +ALLLoadBalancer::ALLLoadBalancer(const std::array &boxMin, const std::array &boxMax, double gamma, + MPI_Comm comm, const std::array &globalSize, + const std::array &localCoordinates, + const std::array &minimalPartitionSize) + : _all(3 /*dim*/, gamma), _minimalPartitionSize(minimalPartitionSize) { + // convert input into non-const vector because that is what ALL expects + std::vector points { + {3, boxMin.data()}, + {3, boxMax.data()}, + }; _all.set_vertices(points); - std::array global_size{static_cast(globalSize[0]), static_cast(globalSize[1]), + // convert input into non-const int arrays because that is what ALL expects + std::array globalSizeIntArray{static_cast(globalSize[0]), static_cast(globalSize[1]), static_cast(globalSize[2])}; std::array coords{static_cast(localCoordinates[0]), static_cast(localCoordinates[1]), static_cast(localCoordinates[2])}; - _all.set_proc_grid_params(coords.data(), global_size.data()); + _all.set_proc_grid_params(coords.data(), globalSizeIntArray.data()); _all.set_communicator(comm); - _coversWholeDomain = {globalSize[0] == 1, global_size[1] == 1, global_size[2] == 1}; - - _minimalPartitionSize = minimalPartitionSize; + _coversWholeDomain = {globalSizeIntArray[0] == 1, globalSizeIntArray[1] == 1, globalSizeIntArray[2] == 1}; } std::tuple, std::array> ALLLoadBalancer::rebalance(double work) { _all.set_work(work); @@ -30,8 +32,8 @@ std::tuple, std::array> ALLLoadBalancer::rebala _all.set_min_domain_size(ALL_LB_t::STAGGERED, _minimalPartitionSize.data()); _all.balance(ALL_LB_t::STAGGERED); auto resultVertices = _all.get_result_vertices(); - std::array boxMin{resultVertices[0].x(0), resultVertices[0].x(1), resultVertices[0].x(2)}; - std::array boxMax{resultVertices[1].x(0), resultVertices[1].x(1), resultVertices[1].x(2)}; _all.set_vertices(resultVertices); + const std::array boxMin{resultVertices[0].x(0), resultVertices[0].x(1), resultVertices[0].x(2)}; + const std::array boxMax{resultVertices[1].x(0), resultVertices[1].x(1), resultVertices[1].x(2)}; return std::make_tuple(boxMin, boxMax); } diff --git a/src/parallel/ALLLoadBalancer.h b/src/parallel/ALLLoadBalancer.h index c9f0eb996..b70562ef1 100644 --- a/src/parallel/ALLLoadBalancer.h +++ b/src/parallel/ALLLoadBalancer.h @@ -11,9 +11,9 @@ class ALLLoadBalancer : public LoadBalancer { public: - ALLLoadBalancer(std::array boxMin, std::array boxMax, double gamma, MPI_Comm comm, - std::array globalSize, std::array localCoordinates, - std::array minimalPartitionSize); + ALLLoadBalancer(const std::array &boxMin, const std::array &boxMax, double gamma, MPI_Comm comm, + const std::array& globalSize, const std::array& localCoordinates, + const std::array& minimalPartitionSize); ~ALLLoadBalancer() override = default; std::tuple, std::array> rebalance(double work) override; From 
8b4291904f7b5c81da000dc335be2fff6a107644 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:46:14 +0200 Subject: [PATCH 08/15] fix automatic grid size calculation --- src/parallel/GeneralDomainDecomposition.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index 549633177..946157a12 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -39,7 +39,8 @@ void GeneralDomainDecomposition::initializeALL() { if (_forceLatchingToLinkedCellsGrid and not _latchGridSize.has_value()) { std::array forcedGridSize{}; for(size_t dim = 0; dim < 3; ++dim){ - size_t numCells = _domainLength[dim] / _interactionLength; + // if we calculate 3.5 cells per dim there is only space for 3 -> floor + const auto numCells = std::floor(_domainLength[dim] / _interactionLength); forcedGridSize[dim] = _domainLength[dim] / numCells; } _latchGridSize = forcedGridSize; From 117ac00df1bab40d6975f08afae2db3897d63f20 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 18:07:25 +0200 Subject: [PATCH 09/15] use non-flushing linebreaks --- src/parallel/DomainDecomposition.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parallel/DomainDecomposition.cpp b/src/parallel/DomainDecomposition.cpp index 4aa51be05..bf2a87437 100644 --- a/src/parallel/DomainDecomposition.cpp +++ b/src/parallel/DomainDecomposition.cpp @@ -87,15 +87,15 @@ bool DomainDecomposition::queryBalanceAndExchangeNonBlocking(bool /*forceRebalan void DomainDecomposition::balanceAndExchange(double /*lastTraversalTime*/, bool /*forceRebalancing*/, ParticleContainer* moleculeContainer, Domain* domain) { if (sendLeavingWithCopies()) { - Log::global_log->debug() << "DD: Sending Leaving and Halos." << std::endl; + Log::global_log->debug() << "DD: Sending Leaving and Halos.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, LEAVING_AND_HALO_COPIES); } else { - Log::global_log->debug() << "DD: Sending Leaving." << std::endl; + Log::global_log->debug() << "DD: Sending Leaving.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, LEAVING_ONLY); #ifndef MARDYN_AUTOPAS moleculeContainer->deleteOuterParticles(); #endif - Log::global_log->debug() << "DD: Sending Halos." 
<< std::endl; + Log::global_log->debug() << "DD: Sending Halos.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, HALO_COPIES); } } From 47180f4dfa9da8085b92de440ac0e985fadcca9a Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 18:37:39 +0200 Subject: [PATCH 10/15] const, rename for clarity and use std::array instead of pointer where possible --- src/parallel/NeighbourCommunicationScheme.cpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/parallel/NeighbourCommunicationScheme.cpp b/src/parallel/NeighbourCommunicationScheme.cpp index cef75c7ff..ab4190d75 100644 --- a/src/parallel/NeighbourCommunicationScheme.cpp +++ b/src/parallel/NeighbourCommunicationScheme.cpp @@ -434,16 +434,17 @@ void NeighbourCommunicationScheme::selectNeighbours(MessageType msgType, bool im void DirectNeighbourCommunicationScheme::initCommunicationPartners(double cutoffRadius, Domain * domain, DomainDecompMPIBase* domainDecomp, ParticleContainer* moleculeContainer) { // corners of the process-specific domain - double rmin[DIMgeom]; // lower corner - double rmax[DIMgeom]; // higher corner - - for (int d = 0; d < DIMgeom; d++) { - rmin[d] = domainDecomp->getBoundingBoxMin(d, domain); - rmax[d] = domainDecomp->getBoundingBoxMax(d, domain); - - // TODO: this should be safe, as long as molecules don't start flying around - // at the speed of one cutoffRadius per time step - } + static_assert(DIMgeom == 3); // The initialization here assumes 3 dimensions! + const std::array localLowerCorner{ + domainDecomp->getBoundingBoxMin(0, domain), + domainDecomp->getBoundingBoxMin(1, domain), + domainDecomp->getBoundingBoxMin(2, domain), + }; + const std::array localUpperCorner{ + domainDecomp->getBoundingBoxMax(0, domain), + domainDecomp->getBoundingBoxMax(1, domain), + domainDecomp->getBoundingBoxMax(2, domain), + }; if (_pushPull) { for (unsigned int d = 0; d < _commDimms; d++) { // why free? 
@@ -458,18 +459,17 @@ void DirectNeighbourCommunicationScheme::initCommunicationPartners(double cutoff } } - HaloRegion ownRegion = {rmin[0], rmin[1], rmin[2], rmax[0], rmax[1], rmax[2], 0, 0, 0, cutoffRadius}; + HaloRegion ownRegion = {localLowerCorner[0], localLowerCorner[1], localLowerCorner[2], localUpperCorner[0], localUpperCorner[1], localUpperCorner[2], 0, 0, 0, cutoffRadius}; if (_pushPull) { - double* cellLength = moleculeContainer->getHaloSize(); + double* const cellLength = moleculeContainer->getHaloSize(); // halo/force regions std::vector haloOrForceRegions = _zonalMethod->getHaloImportForceExportRegions(ownRegion, cutoffRadius, _coversWholeDomain, cellLength); std::vector leavingRegions = - _zonalMethod->getLeavingExportRegions(ownRegion, cutoffRadius, - _coversWholeDomain); + _zonalMethod->getLeavingExportRegions(ownRegion, cutoffRadius, _coversWholeDomain); - std::array globalDomainLength{domain->getGlobalLength(0), domain->getGlobalLength(1), + const std::array globalDomainLength{domain->getGlobalLength(0), domain->getGlobalLength(1), domain->getGlobalLength(2)}; // assuming p1 sends regions to p2 std::tie((*_haloImportForceExportNeighbours)[0], (*_haloExportForceImportNeighbours)[0]) = From 46dc52c2532ebd3be25545af165becd01b1d55d5 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 13:58:25 +0200 Subject: [PATCH 11/15] Refactor NeighborAcquirer.cpp for readability and efficiency - const - constexpr - rename variables - move declarations to usages - limit scope - reserve before push_back --- src/parallel/NeighborAcquirer.cpp | 151 +++++++++++++++--------------- src/parallel/NeighborAcquirer.h | 2 +- 2 files changed, 78 insertions(+), 75 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 86d69fe75..0f3bc5844 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -18,49 +18,50 @@ * saved in partners01. */ std::tuple, std::vector> NeighborAcquirer::acquireNeighbors( - const std::array &globalDomainLength, HaloRegion *ownRegion, std::vector &desiredRegions, + const std::array &globalDomainLength, HaloRegion *ownRegion, const std::vector &desiredRegions, const MPI_Comm &comm, bool excludeOwnRank) { - int my_rank; // my rank + int my_rank{}; // my rank MPI_Comm_rank(comm, &my_rank); - int num_processes; // the number of processes in comm + int num_processes{}; // the number of processes in comm MPI_Comm_size(comm, &num_processes); - int num_regions = desiredRegions.size(); // the number of regions I would like to acquire from other processes + const auto num_regions = desiredRegions.size(); // the number of regions I would like to acquire from other processes // tell the other processes how much you are going to send - int num_bytes_send = - sizeof(int) * 2 + (sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) * 1) * - num_regions; // how many bytes am I going to send to all the other processes? 
- std::vector num_bytes_receive_vec(num_processes, 0); // vector of number of bytes I am going to receive - // MPI_Allreduce(&num_bytes_send, &num_bytes_receive, 1, MPI_INT, MPI_SUM, comm); - MPI_Allgather(&num_bytes_send, 1, MPI_INT, num_bytes_receive_vec.data(), 1, MPI_INT, comm); + // how many bytes am I going to send to all the other processes + const int num_bytes_send = + sizeof(int) * 2 + (sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) * 1) * num_regions; - // create byte buffer + // create byte send buffer std::vector outgoingDesiredRegionsVector(num_bytes_send); // outgoing byte buffer - int i = 0; - int p = 0; // msg format: rank | number_of_regions | region_01 | region_02 | ... - - memcpy(outgoingDesiredRegionsVector.data() + i, &my_rank, sizeof(int)); - i += sizeof(int); - memcpy(outgoingDesiredRegionsVector.data() + i, &num_regions, sizeof(int)); - i += sizeof(int); + // fill the buffer + int bufferPosition = 0; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, &my_rank, sizeof(int)); + bufferPosition += sizeof(int); + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, &num_regions, sizeof(int)); + bufferPosition += sizeof(int); for (auto ®ion : desiredRegions) { // filling up the outgoing byte buffer - memcpy(outgoingDesiredRegionsVector.data() + i, region.rmin, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, region.rmax, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, region.offset, sizeof(int) * 3); - i += sizeof(int) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, ®ion.width, sizeof(double)); - i += sizeof(double); + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.rmin, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.rmax, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.offset, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, ®ion.width, sizeof(double)); + bufferPosition += sizeof(double); } + // set up structure information data for the Allgatherv operation + // vector of number of bytes I am going to receive + std::vector num_bytes_receive_vec(num_processes, 0); + MPI_Allgather(&num_bytes_send, 1, MPI_INT, num_bytes_receive_vec.data(), 1, MPI_INT, comm); + // vector of offsets (=displacement in MPI) in the receive buffer + std::vector num_bytes_displacements(num_processes, 0); int num_bytes_receive = 0; - std::vector num_bytes_displacements(num_processes, 0); // vector of number of bytes I am going to receive for (int j = 0; j < num_processes; j++) { num_bytes_displacements[j] = num_bytes_receive; num_bytes_receive += num_bytes_receive_vec[j]; @@ -74,38 +75,40 @@ std::tuple, std::vector> std::vector numberOfRegionsToSendToRank(num_processes, 0); // outgoing row - int bytesOneRegion = + constexpr int bytesOneRegion = sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) + sizeof(double) * 3; - std::vector>> sendingList(num_processes); // the regions I own and want to send + // the regions I own and want to send: ranks> + std::vector>> sendingList(num_processes); std::vector comm_partners02; - i = 0; - while (i != num_bytes_receive) { - int rank; - int regions; + bufferPosition = 0; + while (bufferPosition < num_bytes_receive /*== buffer length*/) 
{ - memcpy(&rank, incomingDesiredRegionsVector.data() + i, sizeof(int)); - i += sizeof(int); // 4 - memcpy(®ions, incomingDesiredRegionsVector.data() + i, sizeof(int)); - i += sizeof(int); // 4 + int rank{}; + memcpy(&rank, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int)); + bufferPosition += sizeof(int); // 4 + int regions{}; + memcpy(®ions, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int)); + bufferPosition += sizeof(int); // 4 - for (int j = 0; j < regions; j++) { + for (int regionId = 0; regionId < regions; ++regionId) { HaloRegion unshiftedRegion{}; - memcpy(unshiftedRegion.rmin, incomingDesiredRegionsVector.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; // 24 - memcpy(unshiftedRegion.rmax, incomingDesiredRegionsVector.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; // 24 - memcpy(unshiftedRegion.offset, incomingDesiredRegionsVector.data() + i, sizeof(int) * 3); - i += sizeof(int) * 3; // 12 - memcpy(&unshiftedRegion.width, incomingDesiredRegionsVector.data() + i, sizeof(double)); - i += sizeof(double); // 4 + memcpy(unshiftedRegion.rmin, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; // 24 + memcpy(unshiftedRegion.rmax, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; // 24 + memcpy(unshiftedRegion.offset, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; // 12 + memcpy(&unshiftedRegion.width, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double)); + bufferPosition += sizeof(double); // 4 // msg format one region: rmin | rmax | offset | width | shift - auto shiftedRegionShiftPair = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); - - std::vector regionsToTest = shiftedRegionShiftPair.first; - std::vector> shifts = shiftedRegionShiftPair.second; - + auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); + // Before every set of push_backs make sure there is enough space for this set + all remaining. + // Work with the assumption that the others are of the same size as the current ones. + // This is potentially an overestimate but avoids a large number of resizes. + sendingList.reserve(sendingList.size() + ((regions - regionId) * regionsToTest.size())); + comm_partners02.reserve(comm_partners02.size() + ((regions - regionId) * regionsToTest.size())); for(size_t regionIndex = 0; regionIndex < regionsToTest.size(); ++regionIndex){ auto regionToTest = regionsToTest[regionIndex]; if ((not excludeOwnRank or rank != my_rank) and isIncluded(ownRegion, ®ionToTest)) { @@ -113,10 +116,10 @@ std::tuple, std::vector> numberOfRegionsToSendToRank[rank]++; // this is a region I will send to rank - auto overlappedRegion = overlap(*ownRegion, regionToTest); // different shift for the overlap? + const auto overlappedRegion = overlap(*ownRegion, regionToTest); // different shift for the overlap? 
// make a note in partners02 - don't forget to squeeze partners02 - bool enlarged[3][2] = {{false}}; + constexpr bool enlarged[3][2] = {{false}}; for (int k = 0; k < 3; k++) currentShift[k] *= -1; comm_partners02.emplace_back(rank, overlappedRegion.rmin, overlappedRegion.rmax, overlappedRegion.rmin, @@ -143,7 +146,7 @@ std::tuple, std::vector> std::vector singleRegion(bytesOneRegion); - p = 0; + int p = 0; memcpy(&singleRegion[p], unshiftedOverlappedRegion.rmin, sizeof(double) * 3); p += sizeof(double) * 3; memcpy(&singleRegion[p], unshiftedOverlappedRegion.rmax, sizeof(double) * 3); @@ -155,7 +158,7 @@ std::tuple, std::vector> memcpy(&singleRegion[p], currentShift.data(), sizeof(double) * 3); //p += sizeof(double) * 3; - sendingList[rank].push_back(std::move(singleRegion)); + sendingList[rank].emplace_back(std::move(singleRegion)); } } } @@ -218,19 +221,17 @@ std::tuple, std::vector> std::vector comm_partners01; // the communication partners // receive data (blocking) - int byte_counter = 0; - /** * We now receive as many regions as we previously determined that we will receive. * For that we keep track, how many regions we received and increase this according to the number of regions * received per MPI operation. */ - while (byte_counter < numberOfRegionsToReceive[my_rank] * bytesOneRegion) { + for (int byte_counter = 0; byte_counter < numberOfRegionsToReceive[my_rank] * bytesOneRegion; ) { // MPI_PROBE MPI_Probe(MPI_ANY_SOURCE, 1, comm, &probe_status); // interpret probe - int source = probe_status.MPI_SOURCE; - int bytes; + const auto source = probe_status.MPI_SOURCE; + int bytes{}; MPI_Get_count(&probe_status, MPI_BYTE, &bytes); // we have receive `bytes` bytes. So we increase the byte_counter. byte_counter += bytes; @@ -238,24 +239,26 @@ std::tuple, std::vector> std::vector raw_neighbours(bytes); MPI_Recv(raw_neighbours.data(), bytes, MPI_BYTE, source, 1, comm, &rec_status); // Interpret Buffer and add neighbours - for (int k = 0; k < (bytes / bytesOneRegion); k++) { // number of regions from this process + const auto numRegionsToReceive = bytes / bytesOneRegion; + comm_partners01.reserve(std::max(comm_partners01.size(), static_cast(numberOfRegionsToReceive[my_rank] * numRegionsToReceive))); + for (int regionId = 0; regionId < numRegionsToReceive; ++regionId) { // number of regions from this process HaloRegion region{}; - double shift[3]; - i = k * bytesOneRegion; + bufferPosition = regionId * bytesOneRegion; - memcpy(region.rmin, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(region.rmax, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(region.offset, raw_neighbours.data() + i, sizeof(int) * 3); - i += sizeof(int) * 3; - memcpy(®ion.width, raw_neighbours.data() + i, sizeof(double)); - i += sizeof(double); + memcpy(region.rmin, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(region.rmax, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(region.offset, raw_neighbours.data() + bufferPosition, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; + memcpy(®ion.width, raw_neighbours.data() + bufferPosition, sizeof(double)); + bufferPosition += sizeof(double); - memcpy(shift, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; + double shift[3]; + memcpy(shift, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + // bufferPosition += sizeof(double) * 3; - bool 
enlarged[3][2] = {{false}}; + constexpr bool enlarged[3][2] = {{false}}; comm_partners01.emplace_back(source, region.rmin, region.rmax, region.rmin, region.rmax, shift, region.offset, enlarged); diff --git a/src/parallel/NeighborAcquirer.h b/src/parallel/NeighborAcquirer.h index 5fb9cba96..0f92e35ce 100644 --- a/src/parallel/NeighborAcquirer.h +++ b/src/parallel/NeighborAcquirer.h @@ -29,7 +29,7 @@ class NeighborAcquirer { * second vector will own the particles. */ static std::tuple, std::vector> acquireNeighbors( - const std::array& globalDomainLength, HaloRegion* ownRegion, std::vector& desiredRegions, + const std::array& globalDomainLength, HaloRegion* ownRegion, const std::vector& desiredRegions, const MPI_Comm& comm, bool excludeOwnRank=true); static std::vector squeezePartners(const std::vector& partners); From da8e980cd99d712321510d38d5c912dbc7f2537b Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 14:07:29 +0200 Subject: [PATCH 12/15] change low level info logs to debug --- src/parallel/GeneralDomainDecomposition.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index 946157a12..efd6aa750 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -97,7 +97,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo moleculeContainer->deleteOuterParticles(); // rebalance - Log::global_log->info() << "rebalancing..." << std::endl; + Log::global_log->debug() << "rebalancing..." << std::endl; Log::global_log->set_mpi_output_all(); Log::global_log->debug() << "work:" << lastTraversalTime << std::endl; @@ -107,7 +107,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo std::tie(newBoxMin, newBoxMax) = latchToGridSize(newBoxMin, newBoxMax); } // migrate the particles, this will rebuild the moleculeContainer! 
- Log::global_log->info() << "migrating particles" << std::endl; + Log::global_log->debug() << "migrating particles" << std::endl; migrateParticles(domain, moleculeContainer, newBoxMin, newBoxMax); #ifndef MARDYN_AUTOPAS @@ -120,9 +120,9 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo _boxMax = newBoxMax; // init communication partners - Log::global_log->info() << "updating communication partners" << std::endl; + Log::global_log->debug() << "updating communication partners" << std::endl; initCommPartners(moleculeContainer, domain); - Log::global_log->info() << "rebalancing finished" << std::endl; + Log::global_log->debug() << "rebalancing finished" << std::endl; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, HALO_COPIES); } else { if (sendLeavingWithCopies()) { From 3d8dab716987119082d20f516b08d4f222c8dfba Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 15:45:30 +0200 Subject: [PATCH 13/15] const + default inits + comments --- src/parallel/NeighborAcquirer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 0f3bc5844..7f1d00784 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -75,12 +75,13 @@ std::tuple, std::vector> std::vector numberOfRegionsToSendToRank(num_processes, 0); // outgoing row + // parse / deserialize received data constexpr int bytesOneRegion = sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) + sizeof(double) * 3; // the regions I own and want to send: ranks> std::vector>> sendingList(num_processes); - std::vector comm_partners02; + std::vector comm_partners02{}; bufferPosition = 0; while (bufferPosition < num_bytes_receive /*== buffer length*/) { @@ -103,7 +104,7 @@ std::tuple, std::vector> bufferPosition += sizeof(double); // 4 // msg format one region: rmin | rmax | offset | width | shift - auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); + const auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); // Before every set of push_backs make sure there is enough space for this set + all remaining. // Work with the assumption that the others are of the same size as the current ones. // This is potentially an overestimate but avoids a large number of resizes. From c3176486e5f1b1fda3dad88c339af76c6d568669 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Mon, 7 Oct 2024 15:06:10 +0200 Subject: [PATCH 14/15] Reserve the correct subvector of sendingList + clarify doc --- src/parallel/NeighborAcquirer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 7f1d00784..4dba32754 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -106,9 +106,10 @@ std::tuple, std::vector> // msg format one region: rmin | rmax | offset | width | shift const auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); // Before every set of push_backs make sure there is enough space for this set + all remaining. - // Work with the assumption that the others are of the same size as the current ones. - // This is potentially an overestimate but avoids a large number of resizes. 
- sendingList.reserve(sendingList.size() + ((regions - regionId) * regionsToTest.size())); + // This guarantees that there is enough space for the current set of push_backs, and, if subsequent sets + // are smaller, further reallocations can be avoided. This potentially leads to an overestimate but comes + // with the advantage of fewer resizes. + sendingList[rank].reserve(sendingList[rank].size() + ((regions - regionId) * regionsToTest.size())); comm_partners02.reserve(comm_partners02.size() + ((regions - regionId) * regionsToTest.size())); for(size_t regionIndex = 0; regionIndex < regionsToTest.size(); ++regionIndex){ auto regionToTest = regionsToTest[regionIndex]; From 0bb2a7c3ff9ebd27ff12d1955b693d6bd839a950 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Mon, 7 Oct 2024 15:07:20 +0200 Subject: [PATCH 15/15] formatting --- src/parallel/NeighborAcquirer.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 4dba32754..db4ae4078 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -122,12 +122,16 @@ std::tuple, std::vector> // make a note in partners02 - don't forget to squeeze partners02 constexpr bool enlarged[3][2] = {{false}}; - for (int k = 0; k < 3; k++) currentShift[k] *= -1; + for (int k = 0; k < 3; k++) { + currentShift[k] *= -1; + } comm_partners02.emplace_back(rank, overlappedRegion.rmin, overlappedRegion.rmax, overlappedRegion.rmin, overlappedRegion.rmax, currentShift.data(), overlappedRegion.offset, enlarged); - for (int k = 0; k < 3; k++) currentShift[k] *= -1; + for (int k = 0; k < 3; k++) { + currentShift[k] *= -1; + } // Undo the shift. So it is again in the perspective of the rank we got this region from. // We cannot use unshiftedRegion, as it is not overlapped and thus potentially too big.