Add MultiDense layer

- Add MultiDense layer improvements
- Recreate initializer per replica to make sure the seed is properly set
- Add tolerances to test
- Add multi_dense path in generate_nn
- Add MultiDropout
- Replace old dense layer everywhere
- Remove MultiDropout, not necessary
- Update developing weights structure
- Remove MultiDropout once more
- Fix naming inconsistency wrt parallel-prefactor
Showing 9 changed files with 335 additions and 45 deletions.
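The commit message mentions a new multi_dense path in generate_nn and replacing the old per-replica Dense layers. generate_nn itself is not part of the excerpt below, so the following is only a rough sketch of that replacement pattern; build_dense_layers, nodes, and layer_type are illustrative names, not the actual generate_nn API:

from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.layers import Dense

def build_dense_layers(nodes, replica_seeds, layer_type="multi_dense"):
    # Hypothetical sketch only: names and signature are assumptions,
    # not the actual generate_nn code from this commit.
    if layer_type == "multi_dense":
        # New path: one MultiDense per layer handles all replicas at once;
        # only the first layer receives input without a replica axis.
        return [
            MultiDense(
                units=units,
                replica_seeds=replica_seeds,
                replica_input=(i > 0),
                kernel_initializer=GlorotUniform(seed=0),
            )
            for i, units in enumerate(nodes)
        ]
    # Old path: a separate list of Dense layers for each replica.
    return [
        [Dense(units=units, kernel_initializer=GlorotUniform(seed=seed)) for units in nodes]
        for seed in replica_seeds
    ]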
New file (179 lines added):
from typing import List

import tensorflow as tf
from tensorflow.keras.initializers import Initializer
from tensorflow.keras.layers import Dense, Dropout


class MultiDense(Dense):
    """
    Dense layer for multiple replicas at the same time.

    In later layers, the input to this layer already contains one copy per replica;
    in that case the `replica_input` argument should be set to `True`, which is the
    default, and the input shape is (batch_size, replicas, gridsize, features).
    For the first layer there are no replicas yet, so the `replica_input` argument
    should be set to `False`, and the input shape is (batch_size, gridsize, features).

    Weights are initialized using the `replica_seeds` list of seeds, and are identical
    to the weights of a list of single dense layers with the same `replica_seeds`.

    Example
    -------
    >>> from tensorflow.keras import Sequential
    >>> from tensorflow.keras.layers import Dense
    >>> from tensorflow.keras.initializers import GlorotUniform
    >>> import tensorflow as tf
    >>> replicas = 2
    >>> multi_dense_model = Sequential([
    ...     MultiDense(units=8, replica_seeds=[42, 43], replica_input=False, kernel_initializer=GlorotUniform(seed=0)),
    ...     MultiDense(units=4, replica_seeds=[52, 53], kernel_initializer=GlorotUniform(seed=0)),
    ... ])
    >>> single_models = [
    ...     Sequential([
    ...         Dense(units=8, kernel_initializer=GlorotUniform(seed=42 + r)),
    ...         Dense(units=4, kernel_initializer=GlorotUniform(seed=52 + r)),
    ...     ])
    ...     for r in range(replicas)
    ... ]
    >>> gridsize, features = 100, 2
    >>> multi_dense_model.build(input_shape=(None, gridsize, features))
    >>> for single_model in single_models:
    ...     single_model.build(input_shape=(None, gridsize, features))
    >>> test_input = tf.random.uniform(shape=(1, gridsize, features))
    >>> multi_dense_output = multi_dense_model(test_input)
    >>> single_dense_output = tf.stack([single_model(test_input) for single_model in single_models], axis=1)
    >>> tf.reduce_all(tf.equal(multi_dense_output, single_dense_output))

    Parameters
    ----------
    replica_seeds: List[int]
        List of seeds per replica for the kernel initializer.
    kernel_initializer: Initializer
        Initializer class for the kernel.
    replica_input: bool (default: True)
        Whether the input already contains multiple replicas.
    """

    def __init__(
        self,
        replica_seeds: List[int],
        kernel_initializer: Initializer,
        replica_input: bool = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.replicas = len(replica_seeds)
        self.replica_seeds = replica_seeds
        self.kernel_initializer = MultiInitializer(
            single_initializer=kernel_initializer, replica_seeds=replica_seeds
        )
        # super().__init__ has already set self.bias_initializer from kwargs
        # (zeros by default); wrap it so the bias is also stacked per replica.
        self.bias_initializer = MultiInitializer(
            single_initializer=self.bias_initializer, replica_seeds=replica_seeds
        )
        self.replica_input = replica_input

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.kernel = self.add_weight(
            name="kernel",
            shape=(self.replicas, input_dim, self.units),
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        if self.use_bias:
            self.bias = self.add_weight(
                name="bias",
                shape=(self.replicas, 1, self.units),
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.bias = None
        self.input_spec.axes = {-1: input_dim}
        self.built = True

    def call(self, inputs):
        """
        Compute the output of shape (batch_size, replicas, gridsize, units).

        For the first layer (`self.replica_input` is `False`), this is equivalent to
        applying each replica's dense layer separately and stacking the results along
        a new replica axis. If the input already contains multiple replicas, it is
        equivalent to applying each replica's dense layer to its corresponding input.
        """
        if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype:
            inputs = tf.cast(inputs, dtype=self._compute_dtype_object)

        # Einsum axes: b=batch, r=replica, n=gridsize, f=input features, g=units.
        input_axes = 'brnf' if self.replica_input else 'bnf'
        einrule = input_axes + ',rfg->brng'
        outputs = tf.einsum(einrule, inputs, self.kernel)

        # Ensure the static output shape is set when not executing eagerly.
        if not tf.executing_eagerly():
            output_shape = self.compute_output_shape(inputs.shape.as_list())
            outputs.set_shape(output_shape)

        if self.use_bias:
            outputs = outputs + self.bias

        if self.activation is not None:
            outputs = self.activation(outputs)

        return outputs

    def compute_output_shape(self, input_shape):
        # Remove the replica axis from the input shape.
        if self.replica_input:
            input_shape = input_shape[:1] + input_shape[2:]

        output_shape = super().compute_output_shape(input_shape)

        # Add the replica axis back into the output shape.
        output_shape = output_shape[:1] + [self.replicas] + output_shape[1:]

        return output_shape

    def get_config(self):
        config = super().get_config()
        config.update({"replica_input": self.replica_input, "replica_seeds": self.replica_seeds})
        return config

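# Worked example of the shape bookkeeping above (illustrative values, not part
# of the original file): with replica_input=True, replicas=2 and units=4, an
# input shape [batch, 2, gridsize, features] first drops the replica axis to
# [batch, gridsize, features], Dense.compute_output_shape then yields
# [batch, gridsize, 4], and the replica axis is re-inserted at position 1,
# giving [batch, 2, gridsize, 4].
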
class MultiInitializer(Initializer):
    """
    Multi-replica initializer that exactly replicates a stack of single-replica initializers.

    Weights are stacked on the first axis, and the per-replica seeds are added to the base
    seed of the given single-replica initializer.

    Parameters
    ----------
    single_initializer: Initializer
        The single-replica initializer to replicate (used for both kernel and bias).
    replica_seeds: List[int]
        List of seeds per replica, added to the initializer's base seed.
    """

    def __init__(self, single_initializer: Initializer, replica_seeds: List[int]):
        self.initializer_class = type(single_initializer)
        self.initializer_config = single_initializer.get_config()
        self.base_seed = single_initializer.seed if hasattr(single_initializer, "seed") else None
        self.replica_seeds = replica_seeds

    def __call__(self, shape, dtype=None, **kwargs):
        shape = shape[1:]  # Remove the replica axis from the shape.
        per_replica_weights = []
        for replica_seed in self.replica_seeds:
            if self.base_seed is not None:
                self.initializer_config["seed"] = self.base_seed + replica_seed
            # Recreate the initializer for every replica so each draws with its own seed.
            single_initializer = self.initializer_class.from_config(self.initializer_config)

            per_replica_weights.append(single_initializer(shape, dtype, **kwargs))

        return tf.stack(per_replica_weights, axis=0)
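
As a quick check of the seeding scheme, the stacked weights produced by MultiInitializer should match per-replica initializers created with the base seed plus each replica seed. A minimal sketch using only the classes above (the shapes are arbitrary):

import tensorflow as tf
from tensorflow.keras.initializers import GlorotUniform

# Two replicas of GlorotUniform weights with base seed 0 and replica seeds
# 42 and 43; the leading axis of the result is the replica axis.
multi_init = MultiInitializer(single_initializer=GlorotUniform(seed=0), replica_seeds=[42, 43])
stacked = multi_init(shape=(2, 3, 4), dtype=tf.float32)
singles = tf.stack([GlorotUniform(seed=0 + s)((3, 4), dtype=tf.float32) for s in (42, 43)], axis=0)
assert bool(tf.reduce_all(tf.equal(stacked, singles)))  # identical per-replica weights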