Go from one step being an epoch to one step being a batch #1802

Closed
wants to merge 71 commits
Commits
2a49b60
Move replica loop into generate_nn function
APJansen Dec 8, 2023
9c23fa5
Simplify handling of dropout
APJansen Dec 8, 2023
683e354
Factor out layer_generator in generate_dense_network
APJansen Dec 8, 2023
d02e118
Refactor dense_per_flavor_network
APJansen Dec 8, 2023
f907d7f
Move setting of last nodes to generate_nn
APJansen Dec 8, 2023
aa76bc7
Add constant arguments
APJansen Dec 8, 2023
1ad7960
Add constant arguments
APJansen Dec 8, 2023
7758497
Move dropout to generate_nn
APJansen Dec 8, 2023
2d388d9
Move concatenation of per_flavor layers into generate_nn
APJansen Dec 8, 2023
1ef87dc
Make the two layer generators almost equal
APJansen Dec 8, 2023
806e2c1
remove separate dense and dense_per_flavor functions
APJansen Dec 8, 2023
bdbc3c3
Add documentation.
APJansen Dec 8, 2023
e3f9f0c
Simplify per_flavor layer concatenation
APJansen Dec 8, 2023
3d9070f
Reverse order of loops over replicas and layers
APJansen Dec 8, 2023
c8300c8
Fixes for dropout
APJansen Dec 8, 2023
0cf23f2
Fixes for per_flavour
APJansen Dec 8, 2023
b0a8e3b
Fix issue with copying over nodes for per_flavour layer
APJansen Dec 11, 2023
97d2efe
Fix seeds in per_flavour layer
APJansen Dec 11, 2023
4c4a2d5
Add error for combination of dropout with per_flavour layers
APJansen Dec 11, 2023
2287194
Add basis_size argument to per_flavour layer
APJansen Dec 11, 2023
2f68e3d
Fix model_gen tests to use new generate_nn in favor of now removed ge…
APJansen Dec 11, 2023
4dd1649
Allow for nodes to be a tuple
APJansen Dec 11, 2023
6bd6466
Move dropout, per_flavour check to checks
APJansen Dec 11, 2023
2cd9e52
Clarify layer type check
APJansen Dec 14, 2023
1ae1b84
Clarify naming in nn_generator
APJansen Dec 14, 2023
e7a7cb4
Remove initializer_name argument
APJansen Dec 14, 2023
07c1e7d
clarify comment
APJansen Dec 14, 2023
25b8308
Add comment on shared layers
APJansen Dec 14, 2023
692014b
Rewrite comprehension over replica seeds
APJansen Dec 14, 2023
903c75b
Add check on layer type
APJansen Dec 15, 2023
a8fcfd3
Merge prefactors into single layer
APJansen Oct 25, 2023
661d39a
Add replica dimension to preprocessing factor in test
APJansen Dec 5, 2023
6641253
Update preprocessing layer in vpinterface
APJansen Dec 5, 2023
633a5c4
Remove assigning of weight slices
APJansen Dec 13, 2023
019fd56
Simplify loading weights from file
APJansen Jan 8, 2024
3c7607e
Update regression data
APJansen Jan 8, 2024
6d8d6b2
Always return a single NNs model for all replicas, adjust weight gett…
APJansen Jan 9, 2024
4e18e92
Revert "Update regression data"
APJansen Jan 10, 2024
47d5cab
Change structure of regression weights
APJansen Jan 10, 2024
e3ef5f0
Remove now unused postfix
APJansen Jan 10, 2024
3ddf629
Update regression weights
APJansen Jan 10, 2024
023a8d2
Give explicit shape to scatter_to_one
APJansen Jan 10, 2024
25de95c
Update developing weights structure
APJansen Jan 11, 2024
2240560
fix prefix typo
APJansen Jan 22, 2024
e38f359
add double ticks
APJansen Jan 22, 2024
a40c216
rename layer name constants
APJansen Jan 22, 2024
c095381
use constants defined in metamodel.py for layer names
APJansen Jan 22, 2024
115e30c
Explain need for is_stacked_single_replicas
APJansen Jan 22, 2024
8f4a596
shorten line
APJansen Jan 22, 2024
3568bbb
fix constant loading
APJansen Jan 22, 2024
7de2b59
Simplify get_replica_weights
APJansen Jan 22, 2024
3fba42f
NNs -> all_NNs
APJansen Jan 22, 2024
a1c46ec
Clarify get_layer_replica_weights
APJansen Jan 22, 2024
3557ee6
Clarify set_layer_replica_weights
APJansen Jan 22, 2024
2001e63
Remove comment about python 3.11
APJansen Jan 22, 2024
be29387
Merge prefactors into single layer
APJansen Oct 25, 2023
92d21a3
Add MultiDense layer
APJansen Jan 8, 2024
efcfc6a
Add MultiDense layer improvements
APJansen Jan 9, 2024
0e3dd2c
Recreate initializer per replica to make sure seed is properly set
APJansen Jan 9, 2024
4e81bf6
Add tolerences to test
APJansen Jan 9, 2024
08a3a18
Add multi_dense path in generate_nn
APJansen Jan 9, 2024
a84d8ac
Add MultiDropout
APJansen Jan 11, 2024
ef336e1
Replace old dense layer everywhere
APJansen Jan 11, 2024
737ed2b
Remove MultiDropout, not necessary
APJansen Jan 11, 2024
a5c6b11
Update developing weights structure
APJansen Jan 11, 2024
93a5636
Remove MultiDropout once more
APJansen Jan 11, 2024
c07a49b
Fix naming inconsistency wrt parallel-prefactor
APJansen Jan 23, 2024
c77add6
Merge prefactors into single layer
APJansen Oct 25, 2023
304b400
Add MultiDense layer improvements
APJansen Jan 9, 2024
a658304
Replace old dense layer everywhere
APJansen Jan 11, 2024
0fa674d
Avoid TensorFlow overhead by making one step a batch rather than an epoch
APJansen Aug 30, 2023
Binary file modified n3fit/runcards/examples/developing_weights.h5
Binary file not shown.
25 changes: 14 additions & 11 deletions n3fit/src/n3fit/backends/__init__.py
@@ -1,20 +1,23 @@
-from n3fit.backends.keras_backend.internal_state import (
-    set_initial_state,
-    clear_backend_state,
-    set_eager
-)
+from n3fit.backends.keras_backend import callbacks, constraints, operations
 from n3fit.backends.keras_backend.MetaLayer import MetaLayer
-from n3fit.backends.keras_backend.MetaModel import MetaModel
+from n3fit.backends.keras_backend.MetaModel import (
+    NN_LAYER_ALL_REPLICAS,
+    NN_PREFIX,
+    PREPROCESSING_LAYER_ALL_REPLICAS,
+    MetaModel,
+)
 from n3fit.backends.keras_backend.base_layers import (
+    Concatenate,
     Input,
-    concatenate,
     Lambda,
     base_layer_selector,
+    concatenate,
     regularizer_selector,
-    Concatenate,
 )
-from n3fit.backends.keras_backend import operations
-from n3fit.backends.keras_backend import constraints
-from n3fit.backends.keras_backend import callbacks
+from n3fit.backends.keras_backend.internal_state import (
+    clear_backend_state,
+    set_eager,
+    set_initial_state,
+)
 
 print("Using Keras backend")
169 changes: 116 additions & 53 deletions n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -5,6 +5,7 @@
backend-dependent calls.
"""

import logging
import re

import h5py
@@ -16,6 +17,8 @@

import n3fit.backends.keras_backend.operations as op

log = logging.getLogger(__name__)

# Check the TF version to check if legacy-mode is needed (TF < 2.2)
tf_version = tf.__version__.split(".")
if int(tf_version[0]) == 2 and int(tf_version[1]) < 2:
@@ -46,7 +49,8 @@
}

NN_PREFIX = "NN"
PREPROCESSING_PREFIX = "preprocessing_factor"
NN_LAYER_ALL_REPLICAS = "all_NNs"
PREPROCESSING_LAYER_ALL_REPLICAS = "preprocessing_factor"

# Some keys need to work for everyone
for k, v in optimizers.items():
@@ -156,7 +160,7 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
of the model (the loss functions) to the partial losses.

If the model was compiled with input and output data, they will not be passed through.
In this case by default the number of `epochs` will be set to 1
In this case by default the number of ``epochs`` will be set to 1

ex:
{'loss': [100], 'dataset_a_loss1' : [67], 'dataset_2_loss': [33]}
@@ -169,10 +173,36 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
x_params = self._parse_input(x)
if y is None:
y = self.target_tensors
history = super().fit(x=x_params, y=y, epochs=epochs, **kwargs)

# Avoids TensorFlow overhead that happens at every epoch by putting multiple steps in an epoch
steps_per_epoch = self.determine_steps_per_epoch(epochs)

for k, v in x_params.items():
x_params[k] = tf.repeat(v, steps_per_epoch, axis=0)
y = [tf.repeat(yi, steps_per_epoch, axis=0) for yi in y]

history = super().fit(
x=x_params, y=y, epochs=epochs // steps_per_epoch, batch_size=1, **kwargs
)
loss_dict = history.history
return loss_dict

def determine_steps_per_epoch(self, epochs):
num_replicas = self.output_shape[0][0]
# in this case we're most likely running on the CPU and this is not worth it
if num_replicas == 1:
return 1

# On the GPU, run with as many steps per epoch as possible (up to 100), provided it divides the total number of epochs
for divisor in [10, 100]:
if epochs % divisor != 0:
steps_per_epoch = divisor // 10
log.warning(
f"Epochs {epochs} not divisible by {divisor}, using {steps_per_epoch} steps per epoch"
)
return steps_per_epoch
return 100

def predict(self, x=None, **kwargs):
"""Call super().predict with the right input arguments"""
x = self._parse_input(x)
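
For orientation (an editorial sketch, not part of the diff): the change above amortizes Keras' per-epoch overhead by repeating the single input along the batch axis steps_per_epoch times and training for epochs // steps_per_epoch epochs with batch_size=1, so the total number of gradient updates is unchanged. A minimal standalone illustration with a toy model (the model, data and the steps_per_epoch value are assumptions for the example, not the n3fit setup):

import tensorflow as tf

# one input "batch" and matching target; shapes are illustrative
x = tf.ones((1, 8))
y = tf.zeros((1, 1))
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

epochs = 100
steps_per_epoch = 10  # must divide epochs, as checked by determine_steps_per_epoch above
x_rep = tf.repeat(x, steps_per_epoch, axis=0)  # shape (10, 8)
y_rep = tf.repeat(y, steps_per_epoch, axis=0)  # shape (10, 1)

# batch_size=1 turns every repeated copy into its own optimizer step, so the
# 100 updates still happen, but Keras only does 10 epochs worth of bookkeeping
model.fit(x_rep, y_rep, epochs=epochs // steps_per_epoch, batch_size=1, verbose=0)
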
@@ -198,10 +228,15 @@ def compute_losses(self):
out_names = [f"{i}_loss" for i in self.output_names]
out_names.insert(0, "loss")

inputs = self._parse_input(None)
# get rid of the repetitions by number of epochs made in perform_fit
for k, v in inputs.items():
inputs[k] = v[:1]

# Compile an evaluation function
@tf.function
def losses_fun():
predictions = self(self._parse_input(None))
predictions = self(inputs)
# If we only have one dataset the output changes
if len(out_names) == 2:
predictions = [predictions]
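
A quick way to see the de-duplication performed in this hunk (editorial sketch, illustrative values only): after perform_fit has repeated each input along the batch axis, keeping only the first element recovers the original input for loss evaluation.

import tensorflow as tf

v = tf.constant([[1.0, 2.0, 3.0]])     # original input, batch size 1
v_repeated = tf.repeat(v, 10, axis=0)  # what perform_fit feeds to fit()
assert tf.reduce_all(v_repeated[:1] == v)  # slicing undoes the repetition
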
@@ -228,7 +263,7 @@ def compile(
):
"""
Compile the model given an optimizer and a list of loss functions.
The optimizer must be one of those implemented in the `optimizer` attribute of this class.
The optimizer must be one of those implemented in the ``optimizer`` attribute of this class.

Options:
- A learning rate and a list of target outputs can be defined.
@@ -353,14 +388,10 @@ def get_replica_weights(self, i_replica):
dict
dictionary with the weights of the replica
"""
NN_weights = [
tf.Variable(w, name=w.name) for w in self.get_layer(f"{NN_PREFIX}_{i_replica}").weights
]
prepro_weights = [
tf.Variable(w, name=w.name)
for w in self.get_layer(f"{PREPROCESSING_PREFIX}_{i_replica}").weights
]
weights = {NN_PREFIX: NN_weights, PREPROCESSING_PREFIX: prepro_weights}
weights = {}
for layer_type in [NN_LAYER_ALL_REPLICAS, PREPROCESSING_LAYER_ALL_REPLICAS]:
layer = self.get_layer(layer_type)
weights[layer_type] = get_layer_replica_weights(layer, i_replica)

return weights

@@ -378,10 +409,9 @@ def set_replica_weights(self, weights, i_replica=0):
i_replica: int
the replica number to set, defaulting to 0
"""
self.get_layer(f"{NN_PREFIX}_{i_replica}").set_weights(weights[NN_PREFIX])
self.get_layer(f"{PREPROCESSING_PREFIX}_{i_replica}").set_weights(
weights[PREPROCESSING_PREFIX]
)
for layer_type in [NN_LAYER_ALL_REPLICAS, PREPROCESSING_LAYER_ALL_REPLICAS]:
layer = self.get_layer(layer_type)
set_layer_replica_weights(layer=layer, weights=weights[layer_type], i_replica=i_replica)

def split_replicas(self):
"""
@@ -411,51 +441,84 @@ def load_identical_replicas(self, model_file):
"""
From a single replica model, load the same weights into all replicas.
"""
weights = self._format_weights_from_file(model_file)
single_replica = self.single_replica_generator()
single_replica.load_weights(model_file)
weights = single_replica.get_replica_weights(0)

for i_replica in range(self.num_replicas):
self.set_replica_weights(weights, i_replica)

def _format_weights_from_file(self, model_file):
"""Read weights from a .h5 file and format into a dictionary of tf.Variables"""
weights = {}

with h5py.File(model_file, 'r') as f:
# look at layers of the form NN_i and take the lowest i
i_replica = 0
while f"{NN_PREFIX}_{i_replica}" not in f:
i_replica += 1
def is_stacked_single_replicas(layer):
"""
Check if the layer consists of stacked single replicas (only happens for NN layers),
to determine how to extract single replica weights.

weights[NN_PREFIX] = self._extract_weights(
f[f"{NN_PREFIX}_{i_replica}"], NN_PREFIX, i_replica
)
weights[PREPROCESSING_PREFIX] = self._extract_weights(
f[f"{PREPROCESSING_PREFIX}_{i_replica}"], PREPROCESSING_PREFIX, i_replica
)
Parameters
----------
layer: MetaLayer
the layer to check

return weights
Returns
-------
bool
True if the layer consists of stacked single replicas
"""
if not isinstance(layer, MetaModel):
return False
return f"{NN_PREFIX}_0" in [sublayer.name for sublayer in layer.layers]

def _extract_weights(self, h5_group, weights_key, i_replica):
"""Extract weights from a h5py group, turning them into Tensorflow variables"""
weights = []

def append_weights(name, node):
if isinstance(node, h5py.Dataset):
weight_name = node.name.split("/", 2)[-1]
weight_name = weight_name.replace(f"{NN_PREFIX}_{i_replica}", f"{NN_PREFIX}_0")
weight_name = weight_name.replace(
f"{PREPROCESSING_PREFIX}_{i_replica}", f"{PREPROCESSING_PREFIX}_0"
)
weights.append(tf.Variable(node[()], name=weight_name))
def get_layer_replica_weights(layer, i_replica: int):
"""
Get the weights for the given single replica ``i_replica``,
from a ``layer`` that has weights for all replicas.

Note that the layer could be a complete NN with many separated sub_layers,
each of which contains weights for all replicas together.
This function separates the per-replica weights and returns the list of weights as if the
input ``layer`` were made of _only_ replica ``i_replica``.

Parameters
----------
layer: MetaLayer
the layer to get the weights from
i_replica: int
the replica number

Returns
-------
weights: list
list of weights for the replica
"""
if is_stacked_single_replicas(layer):
weights = layer.get_layer(f"{NN_PREFIX}_{i_replica}").weights
else:
weights = [tf.Variable(w[i_replica : i_replica + 1], name=w.name) for w in layer.weights]

h5_group.visititems(append_weights)
return weights


def set_layer_replica_weights(layer, weights, i_replica: int):
"""
Set the weights for the given single replica ``i_replica``.
When the input ``layer`` contains weights for many replicas, ensures that
only those corresponding to replica ``i_replica`` are updated.

Parameters
----------
layer: MetaLayer
the layer to set the weights for
weights: list
list of weights for the replica
i_replica: int
the replica number
"""
if is_stacked_single_replicas(layer):
layer.get_layer(f"{NN_PREFIX}_{i_replica}").set_weights(weights)
return

# have to put them in the same order
weights_ordered = []
weights_model_order = [w.name for w in self.get_replica_weights(0)[weights_key]]
for w in weights_model_order:
for w_h5 in weights:
if w_h5.name == w:
weights_ordered.append(w_h5)
full_weights = [w.numpy() for w in layer.weights]
for w_old, w_new in zip(full_weights, weights):
w_old[i_replica : i_replica + 1] = w_new

return weights_ordered
layer.set_weights(full_weights)
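
To make the slicing path of the two helpers above concrete (editorial sketch with a toy stacked-weight layout; the array shapes and helper names are assumptions, not the n3fit API): when a layer stores weights for all replicas along a leading replica axis, a single replica is read or overwritten by slicing that axis while keeping it as a length-1 dimension, mirroring what get_layer_replica_weights and set_layer_replica_weights do in the non-stacked case.

import numpy as np

n_replicas, n_in, n_out = 3, 4, 2
# stacked kernel: one slice per replica along axis 0
stacked_kernel = np.random.default_rng(0).normal(size=(n_replicas, n_in, n_out))

def get_replica_slice(weights, i_replica):
    # keep the leading axis (length 1) so the slice can be written back as-is
    return [w[i_replica : i_replica + 1] for w in weights]

def set_replica_slice(weights, replica_weights, i_replica):
    for w_all, w_new in zip(weights, replica_weights):
        w_all[i_replica : i_replica + 1] = w_new

replica_1 = get_replica_slice([stacked_kernel], i_replica=1)
set_replica_slice([stacked_kernel], replica_1, i_replica=0)  # copy replica 1 into slot 0
assert np.array_equal(stacked_kernel[0], stacked_kernel[1])
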