diff --git a/aiida_nanotech_empa/workflows/cp2k/cp2k_benchmark_workchain.py b/aiida_nanotech_empa/workflows/cp2k/cp2k_benchmark_workchain.py
index 4e07876..abfd37f 100644
--- a/aiida_nanotech_empa/workflows/cp2k/cp2k_benchmark_workchain.py
+++ b/aiida_nanotech_empa/workflows/cp2k/cp2k_benchmark_workchain.py
@@ -1,6 +1,7 @@
 import copy
 import pathlib
 import re
+import math
 
 import numpy as np
 from ...utils import common_utils
@@ -11,6 +12,24 @@
 
 ALLOWED_PROTOCOLS = ["standard"]
 
+def is_perfect_square(x):
+    """Return True if the integer x is a perfect square; negative x never is."""
+    # math.isqrt is exact for ints but raises ValueError on negatives: guard first.
+    return x >= 0 and math.isqrt(x) ** 2 == x
+
+def find_multiples_of_ngpus(ngpus, n, max_N):
+    """
+    Return the integers N <= max_N that are positive multiples of ngpus
+    and for which n * N is a perfect square, in increasing order.
+
+    Raises ValueError if ngpus is smaller than 1.
+    """
+    if ngpus < 1:
+        # A zero step would make range() fail with an unrelated message.
+        raise ValueError(f"ngpus must be a positive integer, got {ngpus}")
+    candidates = range(ngpus, max_N + 1, ngpus)  # only multiples of ngpus
+    return [N for N in candidates if is_perfect_square(n * N)]
+
 @engine.calcfunction
 def analyze_speedup(time_dict):
     """
@@ -38,14 +57,14 @@ def analyze_speedup(time_dict):
         nnodes_str, ntasks_str, nthreads_str = key.split('_')
         nnodes = int(nnodes_str)
         # Collect time for each nnodes
-        times_per_nnodes[nnodes].append(time)
+        if time != 'FAILED':
+            times_per_nnodes[nnodes].append(time)
 
     # Find the minimum time for each nnodes
     min_times_per_nnodes = {}
     for nnodes, times in times_per_nnodes.items():
         min_time = min(times)
-        if min_time < 100000:
-            min_times_per_nnodes[nnodes] = min_time
+        min_times_per_nnodes[nnodes] = min_time
 
     # Sort nnodes to find the lowest nnodes (reference)
     sorted_nnodes = sorted(min_times_per_nnodes.keys())
@@ -97,7 +116,7 @@ def get_timing_from_FolderData(folder_node=None):
     """
     # Load the FolderData node
     if folder_node is None:
-        return orm.Float(100000)
+        return orm.Str('FAILED')
     
     # Check if 'aiida.out' exists in the FolderData
     if 'aiida.out' not in folder_node.list_object_names():
@@ -184,9 +203,16 @@ def define(cls, spec):
             help="List of #nodes to be used in the benchmark.",
         )
         spec.input(
-            "list_tasks_per_node",
-            valid_type=orm.List,
-            default=lambda: orm.List(list=[6]),
+            "ngpus",
+            valid_type=orm.Int,
+            default=lambda: orm.Int(1),
+            required=True,
+            help="Number of GPUs per node.",
+        )
+        spec.input(
+            "max_tasks_per_node",
+            valid_type=orm.Int,
+            default=lambda: orm.Int(2),
             required=True,
             help="List of #tasks per node to be used in the benchmark.",
         )
@@ -237,9 +263,9 @@ def setup(self):
             "pseudo": orm.SinglefileData(
                 file=pathlib.Path(__file__).parent / "data" / "POTENTIAL"
             ),
-            "mpswrapper": orm.SinglefileData(
-                file=pathlib.Path(__file__).parent / "data" / "mps-wrapper.sh"
-            ),
+            #"mpswrapper": orm.SinglefileData(
+            #    file=pathlib.Path(__file__).parent / "data" / "mps-wrapper.sh"
+            #),
         }
         self.ctx.input_dict = cp2k_utils.load_protocol(
                 "benchmarks.yml", self.inputs.protocol.value)
@@ -283,7 +309,7 @@ def submit_calculations(self):
 	            mywall=20
              
             # Loop for mpi tasks 
-            for ntasks in self.inputs.list_tasks_per_node:
+            for ntasks in find_multiples_of_ngpus(self.inputs.ngpus.value,nnodes, self.inputs.max_tasks_per_node.value):
                 for nthreads in self.inputs.list_threads_per_task:
                     # Loop for threads,check that nthreads * ntasks <= max_tasks
                     if  nthreads<=self.ctx.max_tasks/ntasks :
@@ -326,7 +352,7 @@ def finalize(self):
         
         for nnodes in self.inputs.list_nodes:
             # Loop for mpi tasks 
-            for ntasks in self.inputs.list_tasks_per_node:
+            for ntasks in find_multiples_of_ngpus(self.inputs.ngpus.value,nnodes, self.inputs.max_tasks_per_node.value):
                 # Loop for mpi tasks
                 for nthreads in self.inputs.list_threads_per_task:
                     # Check that nthreads * ntasks <= 72
diff --git a/examples/workflows/example_cp2k_benchmark.py b/examples/workflows/example_cp2k_benchmark.py
index 484c47f..a2cab97 100644
--- a/examples/workflows/example_cp2k_benchmark.py
+++ b/examples/workflows/example_cp2k_benchmark.py
@@ -9,7 +9,7 @@
 
 Cp2kBenchmarkWorkChain = plugins.WorkflowFactory("nanotech_empa.cp2k.benchmark")
 
-def _example_cp2k_benchmark(cp2k_code, nnodes, ntasks,nthreads):
+def _example_cp2k_benchmark(cp2k_code, nnodes, max_ntasks,nthreads):
     # Check test geometries are already in database.
     qb = orm.QueryBuilder()
     qb.append(
@@ -36,7 +36,8 @@ def _example_cp2k_benchmark(cp2k_code, nnodes, ntasks,nthreads):
     builder.code = cp2k_code
     builder.protocol = orm.Str("scf_ot_no_wfn")
     builder.list_nodes = orm.List(list=nnodes)
-    builder.list_tasks_per_node = orm.List(list=ntasks)
+    builder.max_tasks_per_node = orm.Int(max_ntasks)
+    builder.ngpus=orm.Int(1)
     builder.list_threads_per_task = orm.List(list=nthreads)
     builder.metadata.label = "CP2K_Scf"
     builder.structure = structures["c2h2.xyz"]
@@ -50,7 +51,7 @@ def _example_cp2k_benchmark(cp2k_code, nnodes, ntasks,nthreads):
 def run_all(cp2k_code):
     print("####    Starting benchmark")
     _example_cp2k_benchmark(
-        orm.load_code(cp2k_code),[1],[1],[1,2]
+        orm.load_code(cp2k_code),[1],8,[1,2]
     )
 
 if __name__ == "__main__":