From 650f8723a8aa054dcf9bddcccb32a416efe60703 Mon Sep 17 00:00:00 2001
From: Antonio Bellotta
Date: Thu, 23 May 2024 13:41:57 +0200
Subject: [PATCH] [BBPBGLIB-1158] Add empirical mechanism to estimate
 simulation memory usage (#165)

## Context

Add an estimate for the simulation memory usage. The estimate is calculated as 2.5 times the
total amount of memory estimated for cells + synapses. This factor is based on tests we performed
on a variety of circuits, which suggest that the ratio between instantiation and simulation memory
usage tops out at 2.5.

For more information see: https://bbpteam.epfl.ch/project/issues/browse/BBPBGLIB-1158

## Scope

A simple addition to already existing functions.

## Testing

The modification is covered by the already implemented integration tests for the dry run workflow.

## Review

* [x] PR description is complete
* [x] Coding style (imports, function length, New functions, classes or files) are good
* [x] Unit/Scientific test added
* [x] Updated Readme, in-code, developer documentation

---
 docs/architecture.rst      | 14 ++++++++++++--
 neurodamus/utils/memory.py | 20 ++++++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/docs/architecture.rst b/docs/architecture.rst
index ff162b89..9e0d4310 100644
--- a/docs/architecture.rst
+++ b/docs/architecture.rst
@@ -320,8 +320,8 @@ Dry Run
 -------
 
 A dry run mode was introduced to help users in understanding how many nodes and tasks are
-necessary to run a specific circuit. In the future this mode will also be used to improve
-load balancing.
+necessary to run a specific circuit. This mode can also be used to improve load balancing,
+as it generates an `allocation.pkl.gz` file that can later be used to balance the simulation.
 
 By running a dry run, using the `--dry-run` flag, the user will NOT run an actual simulation but
 will get a summary of the estimated memory used for cells and synapses, including also the overhead
@@ -388,6 +388,16 @@ itself, e.g. data structures, loaded libraries and so on. This is done by measur
 process before any of the actual instantiation is done. This value, since it's averaged over all
 ranks that take part in the execution, is then multiplied by the number of ranks used in the execution.
 
+On top of this we also need to consider the memory usage of the simulation itself. Unfortunately,
+at the moment there is no easy way to estimate this value, so we have opted for a simple heuristic:
+we assume that the memory usage of the simulation is proportional to the memory usage of the cells
+and synapses. In tests on a wide variety of circuits, the simulation memory usage was typically
+between 1.5 and 2.5 times that of the cells and synapses, so we adopted the more conservative
+factor of 2.5.
+The simulation estimate is not taken into account in the load balancing part of the dry run, since
+we assume it is proportional to the memory usage of the cells and synapses; it is only used for the
+suggested number of nodes for the simulation and the corresponding target ranks (more on this later).
+
 The final result is then printed to the user in a human readable format together with an estimate
 of the number of nodes needed to run the simulation on the same machine used to run the dry run.
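To make the heuristic described above concrete, here is a minimal, self-contained sketch of how the dry-run components combine into the grand total. All names and numbers here are illustrative only, not part of this patch (the actual implementation lives in `neurodamus/utils/memory.py`, shown in the diff below):

```python
# Illustrative sketch only -- not part of this patch. It mirrors the heuristic
# described in docs/architecture.rst: simulation memory is approximated as
# SIM_ESTIMATE_FACTOR times the cells + synapses estimate.

SIM_ESTIMATE_FACTOR = 2.5  # empirical; observed ratios ranged from 1.5 to 2.5


def estimate_grand_total_mb(base_memory_mb, n_ranks, cells_mb, synapses_mb):
    """Estimated peak memory (MB) for a full simulation on n_ranks ranks."""
    full_overhead = base_memory_mb * n_ranks  # neurodamus baseline, summed over ranks
    simulation = (cells_mb + synapses_mb) * SIM_ESTIMATE_FACTOR
    return full_overhead + cells_mb + synapses_mb + simulation


# Hypothetical numbers: 600 MB baseline per rank, 8 ranks,
# 10 GiB of cells and 4 GiB of synapses.
print(estimate_grand_total_mb(600.0, 8, 10_240.0, 4_096.0))  # 54976.0 MB, ~53.7 GiB
```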
diff --git a/neurodamus/utils/memory.py b/neurodamus/utils/memory.py
index 51a34ef0..0f245a53 100644
--- a/neurodamus/utils/memory.py
+++ b/neurodamus/utils/memory.py
@@ -22,6 +22,11 @@
 
 import numpy as np
 
+# The factor to multiply the cell + synapse memory usage by to get the simulation memory estimate.
+# This is a heuristic estimate based on tests on multiple circuits.
+# More info in docs/architecture.rst.
+SIM_ESTIMATE_FACTOR = 2.5
+
 
 def trim_memory():
     """
@@ -361,8 +366,14 @@ def display_total(self):
         logging.info("| {:<40s} | {:12.1f} |".format(f"Overhead (ranks={MPI.size})", full_overhead))
         logging.info("| {:<40s} | {:12.1f} |".format("Cells", self.cell_memory_total))
         logging.info("| {:<40s} | {:12.1f} |".format("Synapses", self.synapse_memory_total))
+        self.simulation_estimate = (self.cell_memory_total +
+                                    self.synapse_memory_total) * SIM_ESTIMATE_FACTOR
+        logging.info("| {:<40s} | {:12.1f} |".format("Simulation", self.simulation_estimate))
         logging.info("+{:-^57}+".format(""))
-        grand_total = full_overhead + self.cell_memory_total + self.synapse_memory_total
+        grand_total = (full_overhead +
+                       self.cell_memory_total +
+                       self.synapse_memory_total +
+                       self.simulation_estimate)
         grand_total = pretty_printing_memory_mb(grand_total)
         logging.info("| {:<40s} | {:>12s} |".format("GRAND TOTAL", grand_total))
         logging.info("+{:-^57}+".format(""))
@@ -402,7 +413,12 @@ def suggest_nodes(self, margin):
 
         while (prev_est_nodes is None or est_nodes != prev_est_nodes) and iter_count < max_iter:
             prev_est_nodes = est_nodes
-            mem_usage_per_node = full_overhead + self.cell_memory_total + self.synapse_memory_total
+            simulation_estimate = (self.cell_memory_total +
+                                   self.synapse_memory_total) * SIM_ESTIMATE_FACTOR
+            mem_usage_per_node = (full_overhead +
+                                  self.cell_memory_total +
+                                  self.synapse_memory_total +
+                                  simulation_estimate)
             mem_usage_with_margin = mem_usage_per_node * (1 + margin)
             est_nodes = math.ceil(mem_usage_with_margin / DryRunStats.total_memory_available())
             full_overhead = self.base_memory * ranks_per_node * est_nodes
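For readers unfamiliar with `suggest_nodes`: the node suggestion is a small fixed-point iteration, since the overhead term grows with the number of nodes, which itself depends on the total estimated memory. The sketch below is a simplified, standalone rendition of that loop; `node_memory_mb` stands in for `DryRunStats.total_memory_available()`, and all concrete numbers are hypothetical:

```python
import math

SIM_ESTIMATE_FACTOR = 2.5  # same empirical factor as in the patch


def suggest_nodes(base_memory_mb, ranks_per_node, cells_mb, synapses_mb,
                  node_memory_mb, margin, max_iter=5):
    """Iterate until the node estimate reaches a fixed point (or max_iter)."""
    simulation_estimate = (cells_mb + synapses_mb) * SIM_ESTIMATE_FACTOR
    est_nodes, prev_est_nodes, iter_count = 1, None, 0
    full_overhead = base_memory_mb * ranks_per_node * est_nodes
    while (prev_est_nodes is None or est_nodes != prev_est_nodes) and iter_count < max_iter:
        prev_est_nodes = est_nodes
        mem_usage = full_overhead + cells_mb + synapses_mb + simulation_estimate
        est_nodes = math.ceil(mem_usage * (1 + margin) / node_memory_mb)
        # The overhead scales with the new node count, hence the iteration.
        full_overhead = base_memory_mb * ranks_per_node * est_nodes
        iter_count += 1
    return est_nodes


# Hypothetical: 600 MB/rank baseline, 40 ranks per node, 100 GiB cells,
# 40 GiB synapses, 256 GiB nodes, 10% safety margin -> converges to 3 nodes.
print(suggest_nodes(600.0, 40, 102_400.0, 40_960.0, 262_144.0, 0.1))
```

In the actual code the per-node memory is compared against `DryRunStats.total_memory_available()`; the structure of the loop is otherwise the same as in the diff above.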