diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/BUILD b/tensorflow_privacy/privacy/fast_gradient_clipping/BUILD index ffa666f3a..2df0e1cb3 100644 --- a/tensorflow_privacy/privacy/fast_gradient_clipping/BUILD +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/BUILD @@ -6,12 +6,20 @@ py_library( name = "gradient_clipping_utils", srcs = ["gradient_clipping_utils.py"], srcs_version = "PY3", + deps = [":layer_registry"], +) + +py_library( + name = "einsum_utils", + srcs = ["einsum_utils.py"], + srcs_version = "PY3", ) py_library( name = "layer_registry", srcs = ["layer_registry.py"], srcs_version = "PY3", + deps = [":einsum_utils"], ) py_library( diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads.py b/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads.py index 6a37ae30f..42fdc1e95 100644 --- a/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads.py +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads.py @@ -21,11 +21,20 @@ `compute_gradient_norms()` function). """ +from typing import Union, Iterable, Text, TypeAlias + import tensorflow as tf from tensorflow_privacy.privacy.fast_gradient_clipping import gradient_clipping_utils +from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry as lr + +InputTensor: TypeAlias = Union[ + tf.Tensor, Iterable[tf.Tensor], dict[Text, tf.Tensor] +] -def get_registry_generator_fn(tape, layer_registry): +def get_registry_generator_fn( + tape: tf.GradientTape, layer_registry: lr.LayerRegistry +): """Creates the generator function for `compute_gradient_norms()`.""" if layer_registry is None: # Needed for backwards compatibility. @@ -53,7 +62,12 @@ def registry_generator_fn(layer_instance, args, kwargs): return registry_generator_fn -def compute_gradient_norms(input_model, x_batch, y_batch, layer_registry): +def compute_gradient_norms( + input_model: tf.keras.Model, + x_batch: InputTensor, + y_batch: tf.Tensor, + layer_registry: lr.LayerRegistry, +): """Computes the per-example loss gradient norms for given data. Applies a variant of the approach given in @@ -62,7 +76,7 @@ def compute_gradient_norms(input_model, x_batch, y_batch, layer_registry): Args: input_model: The `tf.keras.Model` from which to obtain the layers from. The loss of the model *must* be a scalar loss. - x_batch: A `tf.Tensor` representing a batch of inputs to the model. The + x_batch: An `InputTensor` representing a batch of inputs to the model. The first axis must be the batch dimension. y_batch: A `tf.Tensor` representing a batch of output labels. The first axis must be the batch dimension. The number of examples should match the @@ -106,7 +120,7 @@ def compute_gradient_norms(input_model, x_batch, y_batch, layer_registry): return tf.sqrt(tf.reduce_sum(sqr_norm_tsr, axis=1)) -def compute_clip_weights(l2_norm_clip, gradient_norms): +def compute_clip_weights(l2_norm_clip: float, gradient_norms: tf.Tensor): """Computes the per-example loss/clip weights for clipping. When the sum of the per-example losses is replaced a weighted sum, where @@ -132,7 +146,11 @@ def compute_clip_weights(l2_norm_clip, gradient_norms): def compute_pred_and_clipped_gradients( - input_model, x_batch, y_batch, l2_norm_clip, layer_registry + input_model: tf.keras.Model, + x_batch: InputTensor, + y_batch: tf.Tensor, + l2_norm_clip: float, + layer_registry: lr.LayerRegistry, ): """Computes the per-example predictions and per-example clipped loss gradient. 
@@ -147,7 +165,7 @@ def compute_pred_and_clipped_gradients( Args: input_model: The `tf.keras.Model` from which to obtain the layers from. - x_batch: A `tf.Tensor` representing a batch of inputs to the model. The + x_batch: An `InputTensor` representing a batch of inputs to the model. The first axis must be the batch dimension. y_batch: A `tf.Tensor` representing a batch of output labels. The first axis must be the batch dimension. The number of examples should match the diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads_test.py b/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads_test.py index 183d890ce..6f923db8f 100644 --- a/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads_test.py +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/clip_grads_test.py @@ -13,6 +13,8 @@ # limitations under the License. import itertools +from typing import Callable, Any, List, TypeAlias + from absl.testing import parameterized import tensorflow as tf @@ -20,23 +22,37 @@ from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry +# ============================================================================== +# Type aliases +# ============================================================================== +LayerGenerator: TypeAlias = Callable[ + [List[int], List[int]], tf.keras.layers.Layer +] + +ModelGenerator: TypeAlias = Callable[ + [LayerGenerator, List[int], List[int]], tf.keras.Model +] + + # ============================================================================== # Helper functions and classes. # ============================================================================== class DoubleDense(tf.keras.layers.Layer): """Generates two dense layers nested together.""" - def __init__(self, units): + def __init__(self, shape: List[int]): super().__init__() - self.dense1 = tf.keras.layers.Dense(units) + self.dense1 = tf.keras.layers.Dense(shape) self.dense2 = tf.keras.layers.Dense(1) - def call(self, inputs): + def call(self, inputs: Any): x = self.dense1(inputs) return self.dense2(x) -def double_dense_layer_computation(layer_instance, inputs, tape): +def double_dense_layer_computation( + layer_instance: tf.keras.layers.Layer, inputs: Any, tape: tf.GradientTape +): """Layer registry function for the custom `DoubleDense` layer class.""" vars1, outputs, sqr_norm_fn1 = layer_registry.dense_layer_computation( layer_instance.dense1, inputs, tape @@ -53,7 +69,9 @@ def sqr_norm_fn(base_vars): return [vars1, vars2], outputs, sqr_norm_fn -def compute_true_gradient_norms(input_model, x_batch, y_batch): +def compute_true_gradient_norms( + input_model: tf.keras.Model, x_batch: tf.Tensor, y_batch: tf.Tensor +): """Computes the real gradient norms for an input `(model, x, y)`.""" loss_config = input_model.loss.get_config() loss_config['reduction'] = tf.keras.losses.Reduction.NONE @@ -73,14 +91,14 @@ def compute_true_gradient_norms(input_model, x_batch, y_batch): def get_computed_and_true_norms( - model_generator, - layer_generator, - input_dims, - output_dim, - is_eager, - x_input, - rng_seed=777, - registry=None, + model_generator: ModelGenerator, + layer_generator: LayerGenerator, + input_dims: List[int], + output_dims: List[int], + is_eager: bool, + x_input: tf.Tensor, + rng_seed: int = 777, + registry: layer_registry.LayerRegistry = None, ): """Obtains the true and computed gradient norms for a model and batch input. 
@@ -96,7 +114,7 @@ def get_computed_and_true_norms( Returns a `tf.keras.layers.Layer` that accepts input tensors of dimension `idim` and returns output tensors of dimension `odim`. input_dims: The input dimension(s) of the test `tf.keras.Model` instance. - output_dim: The output dimension of the test `tf.keras.Model` instance. + output_dims: The output dimension(s) of the test `tf.keras.Model` instance. is_eager: A `bool` that is `True` if the model should be run eagerly. x_input: `tf.Tensor` inputs to be tested. rng_seed: An `int` used to initialize model weights. @@ -109,7 +127,7 @@ def get_computed_and_true_norms( model and layer generators. The second element contains the true clipped gradient norms under the aforementioned setting. """ - model = model_generator(layer_generator, input_dims, output_dim) + model = model_generator(layer_generator, input_dims, output_dims) model.compile( optimizer=tf.keras.optimizers.SGD(learning_rate=1.0), loss=tf.keras.losses.MeanSquaredError( @@ -131,61 +149,71 @@ def get_computed_and_true_norms( # ============================================================================== # Model generators. # ============================================================================== -def make_two_layer_sequential_model(layer_generator, input_dim, output_dim): +def make_one_layer_sequential_model(layer_generator, input_dims, output_dims): + """Creates a 1-layer sequential model.""" + inputs = tf.keras.Input(shape=input_dims) + layer1 = layer_generator(input_dims, output_dims) + temp1 = layer1(inputs) + reduction_axes1 = tf.range(1, len(temp1.shape)) + outputs = tf.reduce_sum(temp1, axis=reduction_axes1, keepdims=True) + return tf.keras.Model(inputs=inputs, outputs=outputs) + + +def make_two_layer_sequential_model(layer_generator, input_dims, output_dims): """Creates a 2-layer sequential model.""" model = tf.keras.Sequential() - model.add(tf.keras.Input(shape=(input_dim,))) - model.add(layer_generator(input_dim, output_dim)) + model.add(tf.keras.Input(shape=input_dims)) + model.add(layer_generator(input_dims, output_dims)) model.add(tf.keras.layers.Dense(1)) return model -def make_three_layer_sequential_model(layer_generator, input_dim, output_dim): +def make_three_layer_sequential_model(layer_generator, input_dims, output_dims): """Creates a 3-layer sequential model.""" model = tf.keras.Sequential() - model.add(tf.keras.Input(shape=(input_dim,))) - layer1 = layer_generator(input_dim, output_dim) + model.add(tf.keras.Input(shape=input_dims)) + layer1 = layer_generator(input_dims, output_dims) model.add(layer1) if isinstance(layer1, tf.keras.layers.Embedding): # Having multiple consecutive embedding layers does not make sense since # embedding layers only map integers to real-valued vectors. 
- model.add(tf.keras.layers.Dense(output_dim)) + model.add(tf.keras.layers.Dense(output_dims)) else: - model.add(layer_generator(output_dim, output_dim)) + model.add(layer_generator(output_dims, output_dims)) model.add(tf.keras.layers.Dense(1)) return model -def make_two_layer_functional_model(layer_generator, input_dim, output_dim): +def make_two_layer_functional_model(layer_generator, input_dims, output_dims): """Creates a 2-layer 1-input functional model with a pre-output square op.""" - inputs = tf.keras.Input(shape=(input_dim,)) - layer1 = layer_generator(input_dim, output_dim) + inputs = tf.keras.Input(shape=input_dims) + layer1 = layer_generator(input_dims, output_dims) temp1 = layer1(inputs) temp2 = tf.square(temp1) outputs = tf.keras.layers.Dense(1)(temp2) return tf.keras.Model(inputs=inputs, outputs=outputs) -def make_two_tower_model(layer_generator, input_dim, output_dim): +def make_two_tower_model(layer_generator, input_dims, output_dims): """Creates a 2-layer 2-input functional model.""" - inputs1 = tf.keras.Input(shape=(input_dim,)) - layer1 = layer_generator(input_dim, output_dim) + inputs1 = tf.keras.Input(shape=input_dims) + layer1 = layer_generator(input_dims, output_dims) temp1 = layer1(inputs1) - inputs2 = tf.keras.Input(shape=(input_dim,)) - layer2 = layer_generator(input_dim, output_dim) + inputs2 = tf.keras.Input(shape=input_dims) + layer2 = layer_generator(input_dims, output_dims) temp2 = layer2(inputs2) temp3 = tf.add(temp1, temp2) outputs = tf.keras.layers.Dense(1)(temp3) return tf.keras.Model(inputs=[inputs1, inputs2], outputs=outputs) -def make_bow_model(layer_generator, input_dims, output_dim): +def make_bow_model(layer_generator, input_dims, output_dims): del layer_generator inputs = tf.keras.Input(shape=input_dims) # For the Embedding layer, input_dim is the vocabulary size. This should # be distinguished from the input_dim argument, which is the number of ids # in eache example. - emb_layer = tf.keras.layers.Embedding(input_dim=10, output_dim=output_dim) + emb_layer = tf.keras.layers.Embedding(input_dim=10, output_dim=output_dims[0]) feature_embs = emb_layer(inputs) reduction_axes = tf.range(1, len(feature_embs.shape)) example_embs = tf.expand_dims( @@ -194,7 +222,7 @@ def make_bow_model(layer_generator, input_dims, output_dim): return tf.keras.Model(inputs=inputs, outputs=example_embs) -def make_dense_bow_model(layer_generator, input_dims, output_dim): +def make_dense_bow_model(layer_generator, input_dims, output_dims): del layer_generator inputs = tf.keras.Input(shape=input_dims) # For the Embedding layer, input_dim is the vocabulary size. This should @@ -202,7 +230,7 @@ def make_dense_bow_model(layer_generator, input_dims, output_dim): # in eache example. cardinality = 10 emb_layer = tf.keras.layers.Embedding( - input_dim=cardinality, output_dim=output_dim + input_dim=cardinality, output_dim=output_dims[0] ) feature_embs = emb_layer(inputs) reduction_axes = tf.range(1, len(feature_embs.shape)) @@ -213,7 +241,7 @@ def make_dense_bow_model(layer_generator, input_dims, output_dim): return tf.keras.Model(inputs=inputs, outputs=outputs) -def make_weighted_bow_model(layer_generator, input_dims, output_dim): +def make_weighted_bow_model(layer_generator, input_dims, output_dims): # NOTE: This model only accepts dense input tensors. del layer_generator inputs = tf.keras.Input(shape=input_dims) @@ -222,7 +250,7 @@ def make_weighted_bow_model(layer_generator, input_dims, output_dim): # in eache example. 
cardinality = 10 emb_layer = tf.keras.layers.Embedding( - input_dim=cardinality, output_dim=output_dim + input_dim=cardinality, output_dim=output_dims[0] ) feature_embs = emb_layer(inputs) feature_weights = tf.random.uniform(tf.shape(feature_embs)) @@ -238,7 +266,7 @@ def make_weighted_bow_model(layer_generator, input_dims, output_dim): # ============================================================================== # Factory functions. # ============================================================================== -def get_nd_test_tensors(n): +def get_nd_test_tensors(n: int): """Returns a list of candidate tests for a given dimension n.""" return [ tf.zeros((n,), dtype=tf.float64), @@ -246,7 +274,7 @@ def get_nd_test_tensors(n): ] -def get_nd_test_batches(n): +def get_nd_test_batches(n: int): """Returns a list of candidate input batches of dimension n.""" result = [] tensors = get_nd_test_tensors(n) @@ -263,17 +291,17 @@ def sigmoid_dense_layer(b): return tf.keras.layers.Dense(b, activation='sigmoid') return { - 'pure_dense': lambda a, b: tf.keras.layers.Dense(b), - 'sigmoid_dense': lambda a, b: sigmoid_dense_layer(b), + 'pure_dense': lambda a, b: tf.keras.layers.Dense(b[0]), + 'sigmoid_dense': lambda a, b: sigmoid_dense_layer(b[0]), } def get_dense_model_generators(): return { - 'seq1': make_two_layer_sequential_model, - 'seq2': make_three_layer_sequential_model, - 'func1': make_two_layer_functional_model, - 'tower1': make_two_tower_model, + 'seq2': make_two_layer_sequential_model, + 'seq3': make_three_layer_sequential_model, + 'func2': make_two_layer_functional_model, + 'tower2': make_two_tower_model, } @@ -285,6 +313,64 @@ def get_embedding_model_generators(): } +def get_einsum_layer_generators(): + def pure_einsum_layer(equation, output_dims, bias_axes): + return tf.keras.layers.EinsumDense( + equation, output_dims, bias_axes=bias_axes + ) + + def sigmoid_einsum_layer(equation, output_dims, bias_axes): + return tf.keras.layers.EinsumDense( + equation, output_dims, bias_axes=bias_axes, activation='sigmoid' + ) + + return { + 'pure_einsum': pure_einsum_layer, + 'sigmoid_einsum': sigmoid_einsum_layer, + } + + +def get_einsum_model_generators(): + return { + 'seq1': make_one_layer_sequential_model, + } + + +def get_einsum_parameter_tuples(): + """Consists of (equation, input_dims, output_dims, bias_axes).""" + return [ + # Case (C1). + ('ab,bc->ac', [2], [3], None), + ('ab,bc->ac', [2], [3], 'c'), + ('abc,cd->abd', [2, 3], [2, 4], None), + ('abc,cd->abd', [2, 3], [2, 4], 'b'), + ('abc,cd->abd', [2, 3], [2, 4], 'd'), + ('abc,cd->abd', [2, 3], [2, 4], 'bd'), + ('abc,cef->abef', [2, 3], [2, 4, 5], None), + ('abc,cef->abef', [2, 3], [2, 4, 5], 'bf'), + # Case (C2). + ('...b,bc->...c', [2, 3], [4], None), + ('...b,bc->...c', [2, 3], [4], 'c'), + ('...ab,bc->...ac', [2, 3], [2, 4], None), + ('...ab,bc->...ac', [2, 4], [2, 4], 'c'), + ('...abc,cd->...abd', [2, 3, 4], [2, 3, 5], None), + ('...abc,cd->...abd', [2, 3, 4], [2, 3, 5], 'b'), + ('...abc,cd->...abd', [2, 3, 4], [2, 3, 5], 'd'), + ('...abc,cd->...abd', [2, 3, 4], [2, 3, 5], 'bd'), + ('...abc,cef->...abef', [2, 3, 4], [2, 3, 5, 6], None), + ('...abc,cef->...abef', [2, 3, 4], [2, 3, 5, 6], 'bf'), + # Case (C3). 
+ ('ab...,bc->ac...', [2, 3], [4, 3], None), + ('ab...,bc->ac...', [2, 3], [4, 3], 'c'), + ('abc...,cd->abd...', [2, 3, 4], [2, 5, 4], None), + ('abc...,cd->abd...', [2, 3, 4], [2, 5, 4], 'b'), + ('abc...,cd->abd...', [2, 3, 4], [2, 5, 4], 'd'), + ('abc...,cd->abd...', [2, 3, 4], [2, 5, 4], 'bd'), + ('abc...,cef->abef...', [2, 3, 4], [2, 5, 6, 4], None), + ('abc...,cef->abef...', [2, 3, 4], [2, 5, 6, 4], 'bf'), + ] + + # ============================================================================== # Main tests. # ============================================================================== @@ -301,7 +387,7 @@ def test_clip_weights(self, input_dim, clip_value): self.assertAllLessEqual(t * weights, clip_value + tol) -class ClipGradsDenseLayerTest(tf.test.TestCase, parameterized.TestCase): +class ClipGradsOneDimDenseLayerTest(tf.test.TestCase, parameterized.TestCase): @parameterized.product( model_name=list(get_dense_model_generators().keys()), @@ -318,15 +404,15 @@ def test_gradient_norms_on_various_models( x_batches = get_nd_test_batches(input_dim) default_registry = layer_registry.make_default_layer_registry() for x_batch in x_batches: - if model_name == 'tower1': + if model_name == 'tower2': x_input = [x_batch, x_batch] else: x_input = x_batch (computed_norms, true_norms) = get_computed_and_true_norms( model_generator, layer_generator, - input_dim, - output_dim, + [input_dim], + [output_dim], is_eager, x_input, registry=default_registry, @@ -334,6 +420,34 @@ def test_gradient_norms_on_various_models( self.assertAllClose(computed_norms, true_norms, rtol=1e-3, atol=1e-2) +class ClipGradsTwoDimDenseLayerTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.product( + layer_name=list(get_dense_layer_generators().keys()), + is_eager=[True, False], + ) + def test_gradient_norms_on_various_models(self, layer_name, is_eager): + batch_size = 2 + input_dims = [3, 4] + output_dims = [5] + model_generator = make_one_layer_sequential_model + layer_generator = get_dense_layer_generators()[layer_name] + example_size = tf.reduce_prod(input_dims) + example_values = tf.range(batch_size * example_size, dtype=tf.float32) + x_batch = tf.reshape(example_values, [batch_size] + input_dims) + default_registry = layer_registry.make_default_layer_registry() + (computed_norms, true_norms) = get_computed_and_true_norms( + model_generator, + layer_generator, + input_dims, + output_dims, + is_eager, + x_batch, + registry=default_registry, + ) + self.assertAllClose(computed_norms, true_norms, rtol=1e-3, atol=1e-2) + + class ClipGradsEmbeddingLayerTest(tf.test.TestCase, parameterized.TestCase): # TODO(wkong): Test sparse input tensors when the GitHub CI environment @@ -374,7 +488,7 @@ def test_gradient_norms_on_various_models( model_generator=model_generator, layer_generator=None, input_dims=x_batch.shape[1:], - output_dim=output_dim, + output_dims=[output_dim], is_eager=is_eager, x_input=x_batch, registry=default_registry, @@ -398,9 +512,9 @@ def test_gradient_norms_on_various_models( for x_batch in x_batches: (computed_norms, true_norms) = get_computed_and_true_norms( model_generator=make_two_layer_sequential_model, - layer_generator=lambda a, b: DoubleDense(b), - input_dims=input_dim, - output_dim=output_dim, + layer_generator=lambda a, b: DoubleDense(b[0]), + input_dims=[input_dim], + output_dims=[output_dim], is_eager=is_eager, x_input=x_batch, registry=registry, @@ -408,5 +522,41 @@ def test_gradient_norms_on_various_models( self.assertAllClose(computed_norms, true_norms, rtol=1e-3, atol=1e-2) +class 
ClipGradsEinsumDenseTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.product( + model_name=list(get_einsum_model_generators().keys()), + layer_name=list(get_einsum_layer_generators().keys()), + param_tuple=get_einsum_parameter_tuples(), + is_eager=[False, True], + ) + def test_gradient_norms_on_various_models( + self, model_name, layer_name, param_tuple, is_eager + ): + equation, input_dims, output_dims, bias_axes = param_tuple + model_generator = get_einsum_model_generators()[model_name] + einsum_generator = get_einsum_layer_generators()[layer_name] + registry = layer_registry.make_default_layer_registry() + + def curried_generator(a, b): # pylint: disable=unused-argument + return einsum_generator(equation, output_dims, bias_axes) + + # Each batched input is a reshape of a `tf.range()` call. + batch_size = 2 + example_size = tf.reduce_prod(input_dims) + example_values = tf.range(batch_size * example_size, dtype=tf.float32) + x_batch = tf.reshape(example_values, [batch_size] + input_dims) + (computed_norms, true_norms) = get_computed_and_true_norms( + model_generator=model_generator, + layer_generator=curried_generator, + input_dims=input_dims, + output_dims=output_dims, + is_eager=is_eager, + x_input=x_batch, + registry=registry, + ) + self.assertAllClose(computed_norms, true_norms, rtol=1e-3, atol=1e-2) + + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/einsum_utils.py b/tensorflow_privacy/privacy/fast_gradient_clipping/einsum_utils.py new file mode 100644 index 000000000..ab3e325c2 --- /dev/null +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/einsum_utils.py @@ -0,0 +1,302 @@ +# Copyright 2023, The TensorFlow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Various helper functions related to `tf.keras.layers.EinsumDense`.""" + +import itertools +import re + +import numpy as np +import tensorflow as tf + + +def _is_batch_of_vectors(t: tf.Tensor) -> bool: + """Checks if an input is a batch of 1D vectors.""" + num_nontrivial_indices = 0 + for s in t.shape[1:]: + if num_nontrivial_indices > 1: + return False + if s > 1: + num_nontrivial_indices += 1 + return num_nontrivial_indices <= 1 + + +def _parse_einsum_equation(equation: str) -> tuple[int, tuple[str, str, str]]: + """Returns a case number and I/O substrings of an einsum equation.""" + case_number = 0 + match1 = re.match(r"([a-zA-Z]+),([a-zA-Z]+)->([a-zA-Z]+)", equation) + if match1 is not None: + case_number = 1 + match2 = re.match( + r"\.\.\.([a-zA-Z]+),([a-zA-Z]+)->\.\.\.([a-zA-Z]+)", equation + ) + if match2 is not None: + case_number = 2 + match3 = re.match( + r"([a-zA-Z]+)\.\.\.,([a-zA-Z]+)->([a-zA-Z]+)\.\.\.", equation + ) + if match3 is not None: + case_number = 3 + matched = [g for g in [match1, match2, match3] if g is not None] + if len(matched) != 1: + raise ValueError( + "Invalid Einsum equation string " + + equation + + ". "
+ "Must be one of the forms {ab,bc->ac}, {...ab,bc->...ac}, " + "{ab...,bc->ac...}" + ) + return case_number, matched[0].groups() + + +def _reshape_einsum_inputs( + input_tensor: tf.Tensor, + equation: str, +) -> tf.Tensor: + """Converts input tensor to a batched matrix according to an einsum equation. + + Args: + input_tensor: A `tf.Tensor` corresponding to the first input of the einsum + equation. + equation: The einsum equation `string`. + + Returns: + A rank-3 `tf.Tensor` whose first dimension is the batch dimension. The + product of the non-trivial dimensions of the output should be equal to + the product of the non-trivial dimensions of `input_tensor`. + """ + # Find the components `ab`, `bc`, and `ac` given that `equation` can only be + # one of the following mutually exclusive forms: + # + # (C1) ab,bc->ac, + # (C2) ...ab,bc->...ac + # (C3) ab...,bc->ac... + # + # NOTE: `a`, `b`, and `c` are (possibly) also substrings. + + # Compute the first index of the `b` part of the `ab` component. + input_shape = input_tensor.shape + input_len = len(input_shape) + case_number, (ab_str, bc_str, ac_str) = _parse_einsum_equation(equation) + if case_number == 2: + # In case (C2), the `a` part of this component can be empty, so we have no + # choice but to compare the `c` part of `ac` with the `bc` component. + c_len = 0 + for s1, s2 in itertools.zip_longest(reversed(bc_str), reversed(ac_str)): + if s1 == s2: + c_len += 1 + else: + break + b_len = len(bc_str) - c_len + b_idx = input_len - b_len + else: + # For the other cases, we simply compare `ab` with `ac` to get the length + # of the `a` component, i.e., the first index of `b`. + b_idx = 0 + for s1, s2 in itertools.zip_longest(ab_str, ac_str): + if s1 == s2: + b_idx += 1 + else: + break + # Prepare `input_tensor` for reshaping and get the pivot index of the prepped + # tensor. Note that case (C3) requires a transpose to ensure that matrix + # multiplication is performed by the caller. + if case_number == 3: + ellipses_idx = len(ab_str) + # Convert `ab...` to `a...b`. + new_ordering = ( + list(range(0, b_idx)) + + list(range(ellipses_idx, input_len)) + + list(range(b_idx, ellipses_idx)) + ) + base_tensor = tf.transpose(input_tensor, perm=new_ordering) + ellipses_len = input_len - ellipses_idx + pivot_idx = b_idx + ellipses_len + else: + base_tensor = input_tensor + pivot_idx = b_idx + # The output tensor is a batched set of matrices, split at the pivot index + # of the previously prepped tensor. + base_tensor_shape = base_tensor.shape + batch_size = base_tensor_shape[0] + num_rows = int(np.product(base_tensor_shape[1:pivot_idx])) + num_columns = int(np.product(base_tensor_shape[pivot_idx:])) + return tf.reshape(base_tensor, shape=[batch_size, num_rows, num_columns]) + + +def _reshape_einsum_outputs( + output_tensor: tf.Tensor, + equation: str, +) -> tf.Tensor: + """Converts output tensor to a batched matrix according to an einsum equation. + + The logic is almost the same as in `_reshape_einsum_inputs()` except + in the case where the equation is left-elided by ellipses. For this case, + we need to pass in a reversed kernel shape. + + Args: + output_tensor: A `tf.Tensor` corresponding to the output of the einsum + equation. + equation: The einsum equation `string`. + + Returns: + A rank-3 `tf.Tensor` whose first dimension is the batch dimension. The + product of the non-trivial dimensions of the output should be equal to + the product of the non-trivial dimensions of `output_tensor`. 
+ """ + match = re.match(r"([a-zA-Z|.]+),([a-zA-Z|.]+)->([a-zA-Z|.]+)", equation) + if match is not None: + s1, s2, s3 = match.groups() + else: + raise ValueError( + "Invalid Einsum eqution string " + + equation + + " ." + "Must be one of the forms {ab,bc->ac}, {...ab,bc->...ac}, " + "{ab...,bc->ac...}" + ) + reversed_equation = s3 + "," + s2[::-1] + "->" + s1 + return _reshape_einsum_inputs(output_tensor, reversed_equation) + + +def _get_einsum_bias_adjoint_reduction_axes( + equation: str, + bias_axes: str, + grad_shape: tf.TensorShape, +) -> list[int]: + """Computes axes related to the adjoint of the einsum bias broadcast op.""" + reduction_axes = [] + case_number, (_, _, ac_str) = _parse_einsum_equation(equation) + # If `equation` of the form `...ab,bc->...ac`, i.e., case (C2), we do a + # right to left traversal; the other cases do a left to right traversal. + left_elided = case_number == 2 + grad_indices = range(len(grad_shape)) + traversal_zip = ( + itertools.zip_longest(reversed(grad_indices), reversed(ac_str)) + if left_elided + else itertools.zip_longest(grad_indices, ac_str) + ) + bias_traversal_str = bias_axes[::-1] if left_elided else bias_axes + # Perform the traversal. + ptr = 0 + for idx, output_chr in traversal_zip: + if idx != 0: + if output_chr is not None and ptr < len(bias_axes): + if bias_traversal_str[ptr] == output_chr: + ptr += 1 + else: + reduction_axes.append(idx) + else: + reduction_axes.append(idx) + return reduction_axes + + +def compute_fast_einsum_squared_gradient_norm( + equation: str, + input_tensor: tf.Tensor, + grad_tensor: tf.Tensor, + bias_axes: str | None, +): + """Computes the batch gradient norms of an Einsum gradient decompostion. + + This logic generalizes the one for `tf.keras.layers.Dense`. For reference, + we describe part of the mathematical analysis below. It can be safely skipped + upon first reading of this docstring. + + ----------------------------------------------------------------------------- + BEGIN ANALYSIS + ----------------------------------------------------------------------------- + Recall that the einsum dense computation for a single example is of the form + ``` + output = tf.einsum(equation, input, kernel) + bias, + ``` + where `bias` is broadcasted and summed with the output of the `tf.einsum()` + call, and equation has one of the following forms: + + (C1) ab,bc->ac, + (C2) ...ab,bc->...ac + (C3) ab...,bc->ac... + + Mathematically, the above computation is equivalent to: + ``` + output = tf.matmul(X, W) + Q(bias) + ``` + where `X` (resp. `W`) is a 2D tensor reshaped from `input` (resp. `kernel`) + and `Q` is a linear operator that transforms `bias` to comport with the + tensor output by the `tf.matmul()` call. + + Following the same trick as for `tf.keras.layers.Dense` layers, suppose that + we have: + ``` + loss = f(base_vars) + G = tape.gradient(loss, base_vars) + ``` + Then, using the chain rule and denoting `A'` to be the adjoint of a matrix + `A`, it is straightforward to show that the gradient of `loss` with respect + to `W` is given by the block matrix `K := [X' G; Q' G]`. Hence, the square + norm of `K`, i.e., what is returned by `sqr_norm_fn` is given by + ``` + sqr_norm = + || Q' G ||_F^2 + ``` + where `||.||_F` is the Frobenius norm and `<.,.>` is the Euclidean inner + product for matrices. 
+ ----------------------------------------------------------------------------- + END ANALYSIS + ----------------------------------------------------------------------------- + + Args: + equation: A `string` representing the einsum equation. + input_tensor: A `tf.Tensor` representing the einsum input. + grad_tensor: A `tf.Tensor` that is the gradient of the scalar loss with + respect to the pre-activation tensor. + bias_axes: A `string` that specifies the einsum biases in `equation`. + + Returns: + A 1D `tf.Tensor` whose i-th entry is the squared gradient norm + corresponding to the i-th example in `input_tensor`. + """ + # NOTE: When the input/gradient tensors are 1D, it is MUCH faster to do + # a `tf.square()` + `tf.reduce_sum()` than a single `tf.matmul()`. + + # Compute the matrix `X X'` for each example. + x = _reshape_einsum_inputs(input_tensor, equation) + if _is_batch_of_vectors(x): + x_matrix = tf.reshape(x, [x.shape[0], -1]) + batch_xxt = tf.reduce_sum(tf.square(x_matrix), axis=1) + else: + batch_xxt = tf.matmul(x, x, transpose_b=True) + # Compute the matrix `G G'` for each example. + g = _reshape_einsum_outputs(grad_tensor, equation) + if _is_batch_of_vectors(g): + g_matrix = tf.reshape(g, [g.shape[0], -1]) + batch_ggt = tf.reduce_sum(tf.square(g_matrix), axis=1) + else: + batch_ggt = tf.matmul(g, g, transpose_b=True) + # Compute the inner product and adjust for bias (if it exists). + reduction_axes = tf.range(1, len(batch_ggt.shape)) + sqr_norms = tf.reduce_sum(batch_xxt * batch_ggt, axis=reduction_axes) + if bias_axes is not None: + # The adjoint operator `Q` on `G` is a reduce sum on the axes in `G` that + # are not broadcasted from `bias`. + grads_shape = grad_tensor.shape + adjoint_reduction_axes = _get_einsum_bias_adjoint_reduction_axes( + equation, + bias_axes, + grads_shape, + ) + qg = tf.reduce_sum(grad_tensor, axis=adjoint_reduction_axes) + qg_reduction_axes = tf.range(1, len(qg.shape)) + bias_sqr_norms = tf.reduce_sum(tf.square(qg), axis=qg_reduction_axes) + sqr_norms += bias_sqr_norms + + return sqr_norms diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/gradient_clipping_utils.py b/tensorflow_privacy/privacy/fast_gradient_clipping/gradient_clipping_utils.py index 6dd0d49af..d5284ebc6 100644 --- a/tensorflow_privacy/privacy/fast_gradient_clipping/gradient_clipping_utils.py +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/gradient_clipping_utils.py @@ -13,11 +13,20 @@ # limitations under the License.
"""Utility functions that help in the computation of per-example gradient norms.""" +from typing import Any, Union, Iterable, Text, Callable, Tuple, TypeAlias + from absl import logging import tensorflow as tf +from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry as lr + +InputTensor: TypeAlias = Union[ + tf.Tensor, Iterable[tf.Tensor], dict[Text, tf.Tensor] +] +GeneratorFunction: TypeAlias = Callable[[Any, Tuple, dict], Tuple[Any, Any]] + -def has_internal_compute_graph(input_object): +def has_internal_compute_graph(input_object: Any): """Checks if input is a TF model and has a TF internal compute graph.""" return ( isinstance(input_object, tf.keras.Model) @@ -28,7 +37,7 @@ def has_internal_compute_graph(input_object): ) -def _get_internal_layers(input_layer): +def _get_internal_layers(input_layer: tf.keras.layers.Layer): """Returns a list of layers that are nested within a given layer.""" internal_layers = [] if isinstance(input_layer, tf.keras.Model) and hasattr(input_layer, 'layers'): @@ -39,7 +48,11 @@ def _get_internal_layers(input_layer): return internal_layers -def model_forward_pass(input_model, inputs, generator_fn=None): +def model_forward_pass( + input_model: tf.keras.Model, + inputs: InputTensor, + generator_fn: GeneratorFunction | None = None, +): """Does a forward pass of a model and returns useful intermediates. NOTE: the graph traversal algorithm is an adaptation of the logic in the @@ -118,7 +131,9 @@ def generator_fn(layer_instance, args, kwargs): return node_layer_outputs, generator_outputs_list -def all_trainable_layers_are_registered(input_model, layer_registry): +def all_trainable_layers_are_registered( + input_model: tf.keras.Model, layer_registry: lr.LayerRegistry +): """Check if an input model's trainable layers are all registered. Args: @@ -140,7 +155,11 @@ def all_trainable_layers_are_registered(input_model, layer_registry): def add_aggregate_noise( - input_model, x_batch, clipped_grads, l2_norm_clip, noise_multiplier + input_model: tf.keras.Model, + x_batch: InputTensor, + clipped_grads: list[tf.Tensor], + l2_norm_clip: float, + noise_multiplier: float, ): """Adds noise to a collection of clipped gradients. @@ -148,10 +167,9 @@ def add_aggregate_noise( input model's loss function. Args: - input_model: The Keras model to obtain the layers from. - x_batch: A collection of Tensors to be fed into the input layer of the - model. - clipped_grads: A list of tensors representing the clipped gradients. + input_model: The `tf.keras.Model` to obtain the layers from. + x_batch: An `InputTensor` to be fed into the input layer of the model. + clipped_grads: A list of `tf.Tensor`s representing the clipped gradients. l2_norm_clip: Clipping norm (max L2 norm of each gradient). noise_multiplier: Ratio of the standard deviation to the clipping norm. 
@@ -187,7 +205,7 @@ def add_noise(g): return tf.nest.map_structure(add_noise, clipped_grads) -def generate_model_outputs_using_core_keras_layers(input_model): +def generate_model_outputs_using_core_keras_layers(input_model: tf.keras.Model): """Returns the model outputs generated by only core Keras layers.""" cust_obj_dict = dict.copy(tf.keras.utils.get_custom_objects()) cust_hash_set = set([hash(v) for v in cust_obj_dict.values()]) diff --git a/tensorflow_privacy/privacy/fast_gradient_clipping/layer_registry.py b/tensorflow_privacy/privacy/fast_gradient_clipping/layer_registry.py index c8279baf4..b19bb18cb 100644 --- a/tensorflow_privacy/privacy/fast_gradient_clipping/layer_registry.py +++ b/tensorflow_privacy/privacy/fast_gradient_clipping/layer_registry.py @@ -40,8 +40,22 @@ Details of this decomposition can be found in https://arxiv.org/abs/1510.01799 """ +from typing import Callable, Type, Any, TypeAlias import tensorflow as tf +from tensorflow_privacy.privacy.fast_gradient_clipping import einsum_utils + +# ============================================================================== +# Type aliases +# ============================================================================== +SquareNormFunction: TypeAlias = Callable[[Any], tf.Tensor] + +RegistryFunctionOutput: TypeAlias = tuple[Any, tf.Tensor, SquareNormFunction] + +RegistryFunction: TypeAlias = Callable[ + [Any, tuple[Any], tf.GradientTape], RegistryFunctionOutput +] + # ============================================================================== # Main class @@ -54,15 +68,19 @@ def __init__(self): self._layer_class_dict = {} self._registry = {} - def is_elem(self, layer_instance): + def is_elem(self, layer_instance: tf.keras.layers.Layer) -> bool: """Checks if a layer instance's class is in the registry.""" return hash(layer_instance.__class__) in self._registry - def lookup(self, layer_instance): + def lookup(self, layer_instance: tf.keras.layers.Layer) -> RegistryFunction: """Returns the layer registry function for a given layer instance.""" return self._registry[hash(layer_instance.__class__)] - def insert(self, layer_class, layer_registry_function): + def insert( + self, + layer_class: Type[tf.keras.layers.Layer], + layer_registry_function: RegistryFunction, + ): """Inserts a layer registry function into the internal dictionaries.""" layer_key = hash(layer_class) self._layer_class_dict[layer_key] = layer_class @@ -72,7 +90,11 @@ def insert(self, layer_class, layer_registry_function): # ============================================================================== # Supported Keras layers # ============================================================================== -def dense_layer_computation(layer_instance, inputs, tape): +def dense_layer_computation( + layer_instance: tf.keras.layers.Dense, + inputs: tuple[tf.Tensor], + tape: tf.GradientTape, +) -> RegistryFunctionOutput: """Registry function for `tf.keras.layers.Dense`. 
The logic for this computation is based on the following paper: @@ -106,26 +128,70 @@ def dense_layer_computation(layer_instance, inputs, tape): tape.watch(base_vars) layer_instance.activation = orig_activation outputs = orig_activation(base_vars) if orig_activation else base_vars - def sqr_norm_fn(base_vars_grads): - sqr_inputs = tf.square(*inputs) - inputs_reduction_axes = tf.range(1, tf.rank(sqr_inputs)) - input_sqr_norms = tf.reduce_sum(sqr_inputs, axis=inputs_reduction_axes) - if layer_instance.use_bias: - # Adding a bias term is equivalent to a layer with no bias term and which - # adds an additional variable to the layer input that only takes a - # constant value of 1.0. This is thus equivalent to adding 1.0 to the sum - # of the squared values of the inputs. - input_sqr_norms += tf.cast(1.0, dtype=input_sqr_norms.dtype) - reduction_axes = tf.range(1, tf.rank(base_vars_grads)) - base_vars_sqr_norms = tf.reduce_sum( - tf.square(base_vars_grads), axis=reduction_axes + def sqr_norm_fn(grads): + # `Dense` layers are special instances of `EinsumDense` layers. + return einsum_utils.compute_fast_einsum_squared_gradient_norm( + "...b,bc->...c", + inputs[0], + grads, + "c" if layer_instance.use_bias else None, + ) + + return base_vars, outputs, sqr_norm_fn + + +def einsum_layer_computation( + layer_instance: tf.keras.layers.EinsumDense, + inputs: tuple[tf.Tensor], + tape: tf.GradientTape, +): + """Registry function for `tf.keras.layers.EinsumDense`. + + For the technical details, see the documentation of + `einsum_utils.compute_fast_einsum_squared_gradient_norm()`. + + Args: + layer_instance: A `tf.keras.layers.EinsumDense` instance. + inputs: A `tf.Tensor` which can be passed into the layer instance, i.e., + `layer_instance(inputs)` returns a valid output. + tape: A `tf.GradientTape` instance that will be used to watch the output + `base_vars`. + + Returns: + A `tuple` `(base_vars, outputs, sqr_norm_fn)`. `base_vars` is the + intermediate Tensor used in the chain-rule / "fast" clipping trick, + `outputs` is the result of `layer_instance(*inputs)`, and `sqr_norm_fn` is + a function that takes one input, a `tf.Tensor` that represents the output + of the call `tape.gradient(summed_loss, base_vars)` where `tape` is a + `tf.GradientTape` instance that records the einsum dense layer computation + and `summed_loss` is the sum of the per-example losses of the underlying + model. This function then returns the per-example squared L2 gradient norms + of the trainable variables in `layer_instance`. These squared norms should + be a 1D `tf.Tensor` of length `batch_size`. + """ + orig_activation = layer_instance.activation + layer_instance.activation = None + base_vars = layer_instance(*inputs) + tape.watch(base_vars) + layer_instance.activation = orig_activation + outputs = orig_activation(base_vars) if orig_activation else base_vars + + def sqr_norm_fn(grads): + return einsum_utils.compute_fast_einsum_squared_gradient_norm( + layer_instance.equation, + inputs[0], + grads, + layer_instance.bias_axes, + ) - return input_sqr_norms * base_vars_sqr_norms return base_vars, outputs, sqr_norm_fn -def embedding_layer_computation(layer_instance, inputs, tape): +def embedding_layer_computation( + layer_instance: tf.keras.layers.Embedding, + inputs: tuple[tf.Tensor], + tape: tf.GradientTape, +) -> RegistryFunctionOutput: + """Registry function for `tf.keras.layers.Embedding`.
The logic of this computation is based on the `tf.keras.layers.Dense` @@ -225,8 +291,9 @@ def sqr_norm_fn(base_vars_grads): # ============================================================================== # Main factory methods # ============================================================================== -def make_default_layer_registry(): +def make_default_layer_registry() -> LayerRegistry: registry = LayerRegistry() registry.insert(tf.keras.layers.Dense, dense_layer_computation) registry.insert(tf.keras.layers.Embedding, embedding_layer_computation) + registry.insert(tf.keras.layers.EinsumDense, einsum_layer_computation) return registry
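For reference, a minimal sketch of how the new `EinsumDense` registry entry can be exercised end to end; the toy one-layer model, tensor shapes, and random data below are illustrative assumptions, not part of this change:

import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import clip_grads
from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry

# Toy functional model with a single EinsumDense layer (case (C1), with bias).
inputs = tf.keras.Input(shape=(3,))
outputs = tf.keras.layers.EinsumDense('ab,bc->ac', output_shape=4, bias_axes='c')(inputs)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss=tf.keras.losses.MeanSquaredError())

x_batch = tf.random.normal([8, 3])
y_batch = tf.random.normal([8, 4])

# The default registry now routes EinsumDense layers through
# einsum_layer_computation(), so per-example gradient norms are obtained
# without materializing per-example gradients.
registry = layer_registry.make_default_layer_registry()
per_example_norms = clip_grads.compute_gradient_norms(
    model, x_batch, y_batch, registry
)
# `per_example_norms` has shape [8]: one L2 gradient norm per example.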