
Commit b8387f7

patnotz authored and Google-ML-Automation committed
Distributed Flax Shakespeare model training example
PiperOrigin-RevId: 797963267
1 parent 8bf2049 commit b8387f7

File tree

5 files changed, +70 -32 lines changed


jax_tpu_embedding/sparsecore/examples/models/shakespeare/flax_model.py

Lines changed: 45 additions & 7 deletions
@@ -20,7 +20,6 @@
 from jax_tpu_embedding.sparsecore.lib.nn import embedding
 from jax_tpu_embedding.sparsecore.lib.nn import embedding_spec
 
-
 shard_map = jax.experimental.shard_map.shard_map
 Nested = embedding.Nested
 
@@ -30,19 +29,39 @@
 ################################################################################
 class Model(nn.Module):
   """Shakespeare model using embedding layer."""
-  feature_specs: Nested[embedding_spec.FeatureSpec]
 
+  feature_specs: Nested[embedding_spec.FeatureSpec]
   global_batch_size: int
   vocab_size: int
   seq_len: int
   embedding_size: int
-  table_name: str = 'shakespeare_table'
   feature_name: str = 'shakespeare_feature'
   mesh: jax.sharding.Mesh | None = None
   sharding_axis: str = 'sparsecore_sharding'
 
+  def add_sharding_constraint(self, x: jax.Array, names: tuple[str | None]):
+    # Add a sharding constraint to the array.
+    #
+    # Add a sharding constraint to the array to ensure that the sharding
+    # information is not lost during compilation. This may not be necessary but
+    # it helps SPMD and ensures that the sharding information is as expected.
+    #
+    # Args:
+    #   x: The array to add the sharding constraint to.
+    #   names: The mesh axes for the partition spec.
+    #
+    # Returns:
+    #   The array with the sharding constraint added.
+    return jax.lax.with_sharding_constraint(
+        x,
+        jax.sharding.NamedSharding(
+            self.mesh, jax.sharding.PartitionSpec(*names)
+        ),
+    )
+
   @nn.compact
-  def __call__(self, embedding_lookup_inputs: embed.EmbeddingLookupInput):
+  def __call__(self, embedding_lookup_inputs: embedding.PreprocessedInput):
+    # Run the embedding layer.
     x = embed.SparseCoreEmbed(
         feature_specs=self.feature_specs,
         mesh=self.mesh,
@@ -52,9 +71,28 @@ def __call__(self, embedding_lookup_inputs: embed.EmbeddingLookupInput):
     # Unpack the activations.
     x = x[self.feature_name]
     x = jnp.reshape(x, (self.global_batch_size, -1))
+    x = self.add_sharding_constraint(x, (self.sharding_axis,))
 
-    # Apply the model.
-    x = nn.Dense(self.embedding_size)(x)
-    x = nn.Dense(self.vocab_size)(x)
+    # Apply the dense portion of the model.
+    x = nn.Dense(
+        self.embedding_size,
+        kernel_init=nn.with_partitioning(
+            nn.initializers.xavier_uniform(), (self.sharding_axis,)
+        ),
+        bias_init=nn.with_partitioning(
+            nn.initializers.zeros, (self.sharding_axis,)
+        ),
+    )(x)
+    x = self.add_sharding_constraint(x, (self.sharding_axis,))
+    x = nn.Dense(
+        self.vocab_size,
+        kernel_init=nn.with_partitioning(
+            nn.initializers.xavier_uniform(), (self.sharding_axis,)
+        ),
+        bias_init=nn.with_partitioning(
+            nn.initializers.zeros, (self.sharding_axis,)
+        ),
+    )(x)
+    x = self.add_sharding_constraint(x, (self.sharding_axis,))
 
     return x
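
The sharding pattern this file introduces can be exercised on its own. The sketch below is illustrative only: the Head module, its layer sizes, the batch shape, and the single 'sparsecore_sharding' mesh axis are assumptions made for the example, not code from the repository. It shows how jax.lax.with_sharding_constraint and nn.with_partitioning combine, in the spirit of add_sharding_constraint and the partitioned Dense layers above.

import functools

import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np

# A 1-D device mesh; the axis name mirrors the model's default sharding_axis.
mesh = jax.sharding.Mesh(np.array(jax.devices()), ('sparsecore_sharding',))


def constrain(x):
  # Pin the batch dimension to the mesh axis so the compiler keeps the
  # intended layout, as Model.add_sharding_constraint does.
  return jax.lax.with_sharding_constraint(
      x,
      jax.sharding.NamedSharding(
          mesh, jax.sharding.PartitionSpec('sparsecore_sharding')
      ),
  )


class Head(nn.Module):
  """Two partitioned Dense layers, echoing the dense stack in the commit."""
  hidden: int = 16
  out: int = 8

  @nn.compact
  def __call__(self, x):
    # kernel/bias carry partition metadata via nn.with_partitioning.
    dense = functools.partial(
        nn.Dense,
        kernel_init=nn.with_partitioning(
            nn.initializers.xavier_uniform(), ('sparsecore_sharding',)
        ),
        bias_init=nn.with_partitioning(
            nn.initializers.zeros, ('sparsecore_sharding',)
        ),
    )
    x = constrain(dense(self.hidden)(x))
    return constrain(dense(self.out)(x))


model = Head()
x = jnp.ones((len(jax.devices()) * 2, 4))  # batch divisible by the mesh size
params = jax.jit(model.init)(jax.random.PRNGKey(0), x)
y = jax.jit(model.apply)(params, x)
print(y.shape)  # (batch, 8)

Keeping the partitioning metadata on the parameters lets downstream tooling (for example flax.linen.get_partition_spec) recover the intended layout when sharding the train state.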

jax_tpu_embedding/sparsecore/lib/flax/embed.py

Lines changed: 18 additions & 16 deletions
@@ -32,7 +32,6 @@
 DLL = layout.DeviceLocalLayout  # type: ignore
 Layout = layout.Format
 LogicalNames = typing.LogicalNames
-P = jax.sharding.PartitionSpec
 shard_map = jax.experimental.shard_map.shard_map
 Nested = embedding.Nested
 EmbeddingLookupInput = embedding.PreprocessedInput
@@ -60,7 +59,8 @@ def with_sparsecore_layout(
     fn: Callable[..., Any],
     names: LogicalNames,
     mesh: jax.sharding.Mesh,
-):
+) -> Callable[..., Any]:
+  """Wraps a function to add a SparseCore layout."""
   @functools.wraps(fn)
   def wrapper(*args, **kwargs):
     return WithSparseCoreLayout(fn(*args, **kwargs), names, mesh=mesh)
@@ -73,7 +73,7 @@ class SparseCoreEmbed(nn.Module):
 
   # A sequence of FeatureSpecs to specify the configurations for the
   # input feature.
-  feature_specs: Nested[embedding_spec.FeatureSpec]
+  feature_specs: embedding.Nested[embedding_spec.FeatureSpec]
   # Axis in the mesh to use for sharding.
   sharding_axis: str = 'sparsecore_sharding'
   # Mesh to use for the embedding layer.
@@ -94,8 +94,10 @@ def __post_init__(self):
     super().__post_init__()
 
   def setup(self):
-    self.embedding_table_partition = P(self.sharding_axis, None)
-    self.data_partition = P(self.sharding_axis)
+    self.embedding_table_partition = jax.sharding.PartitionSpec(
+        self.sharding_axis, None
+    )
+    self.data_partition = jax.sharding.PartitionSpec(self.sharding_axis)
     self.num_shards = self.mesh.shape[self.sharding_axis]
 
     initializer = functools.partial(
@@ -118,17 +120,17 @@ def _wrap_initializer(
       self, initializer: Callable[[jax.Array], tuple[jax.Array, ...]]
   ):
     return with_sparsecore_layout(
-        initializer,
-        (self.sharding_axis,),
+        fn=initializer,
+        names=(self.sharding_axis, None),
         mesh=self.mesh,
     )
 
   def preprocess_inputs(
       self,
       step: int,
-      features: Nested[np.ndarray],
-      features_weights: Nested[np.ndarray],
-  ) -> EmbeddingLookupInput:
+      features: embedding.Nested[np.ndarray],
+      features_weights: embedding.Nested[np.ndarray],
+  ) -> embedding.PreprocessedInput:
     """Preprocesses the input for sparse dense matmul.
 
     This method do not need to be invoked with module.apply().
@@ -157,8 +159,8 @@ def preprocess_inputs(
     )[0]
 
   def __call__(
-      self, embedding_lookup_inputs: EmbeddingLookupInput
-  ) -> Nested[jax.Array]:
+      self, embedding_lookup_inputs: embedding.PreprocessedInput
+  ) -> embedding.Nested[jax.Array]:
     """Computes the embedding activations.
 
     Args:
@@ -175,8 +177,8 @@ def __call__(
 
   def apply_gradient(
       self,
-      gradients: Nested[jax.Array],
-      embedding_lookup_inputs: EmbeddingLookupInput,
+      gradients: embedding.Nested[jax.Array],
+      embedding_lookup_inputs: embedding.PreprocessedInput,
   ) -> Mapping[str, Mapping[str, jax.Array]]:
     """Apply the gradients to the embedding variables.
 
@@ -202,7 +204,7 @@ def apply_gradient(
 @functools.partial(jax.custom_vjp, nondiff_argnums=(0,))
 def _emb_lookup(
     embedding_layer: SparseCoreEmbed,
-    embedding_lookup_inputs: EmbeddingLookupInput,
+    embedding_lookup_inputs: embedding.PreprocessedInput,
     emb_table: Mapping[str, tuple[jax.Array, ...]],
 ):
   pt = embedding_layer.embedding_table_partition
@@ -226,7 +228,7 @@ def _emb_lookup(
 
 def _emb_lookup_fwd(
     embedding_layer: SparseCoreEmbed,
-    embedding_lookup_inputs: EmbeddingLookupInput,
+    embedding_lookup_inputs: embedding.PreprocessedInput,
     emb_table: Mapping[str, tuple[jax.Array, ...]],
 ):
   return _emb_lookup(
jax_tpu_embedding/sparsecore/lib/flax/tests/autograd_test.py

Lines changed: 1 addition & 2 deletions
@@ -22,7 +22,6 @@
 import jax.numpy as jnp
 from jax_tpu_embedding.sparsecore.examples.models.shakespeare import dataset as shakespeare_data
 from jax_tpu_embedding.sparsecore.examples.models.shakespeare import flax_model as shakespeare_model
-from jax_tpu_embedding.sparsecore.lib.flax import embed
 from jax_tpu_embedding.sparsecore.lib.flax import embed_optimizer
 from jax_tpu_embedding.sparsecore.lib.nn import embedding
 from jax_tpu_embedding.sparsecore.lib.nn import embedding_spec
@@ -173,7 +172,7 @@ def process_inputs(batch_number, feature_batch):
     )
     def train_step(
         params: Any,
-        embedding_lookup_inputs: embed.EmbeddingLookupInput,
+        embedding_lookup_inputs: embedding.PreprocessedInput,
         labels: jax.Array,
         opt_state,
     ):

jax_tpu_embedding/sparsecore/lib/flax/tests/embed_test.py

Lines changed: 2 additions & 3 deletions
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
-from typing import Tuple
 
 from absl.testing import absltest
 import einops
@@ -197,8 +196,8 @@ class EmbeddingLayerTest(absltest.TestCase):
 
   def _row_initialize_with_padding(
       self,
-      shape: Tuple[int, ...],
-      padded_shape: Tuple[int, ...],
+      shape: tuple[int, ...],
+      padded_shape: tuple[int, ...],
       offset: int = 0,
       pad_value: float = _PAD_VALUE,
   ):

jax_tpu_embedding/sparsecore/lib/nn/embedding.py

Lines changed: 4 additions & 4 deletions
@@ -16,7 +16,7 @@
 import collections
 import dataclasses
 import functools
-from typing import List, Mapping, NamedTuple, Sequence, Tuple, TypeAlias, TypeVar, Union
+from typing import List, Mapping, NamedTuple, Sequence, TypeAlias, TypeVar, Union
 import warnings
 
 from absl import logging
@@ -119,8 +119,8 @@ class PreprocessedInput(struct.PyTreeNode):
   """
 
   sparse_dense_matmul_input: SparseDenseMatmulInput
-  num_minibatches: jnp.ndarray = struct.field(
-      default_factory=lambda: jnp.array(1)
+  num_minibatches: np.ndarray = struct.field(
+      default_factory=lambda: np.array(1)
   )
 
   # Backward compatibility properties and functions. This class acts as a
@@ -1068,7 +1068,7 @@ def _init_stacked_embedding_table(
     stack_name: str,
     table_specs: List[embedding_spec.TableSpec],
     global_sharding: jax.sharding.NamedSharding,
-    sharding_axis: str | Tuple[str, ...],
+    sharding_axis: str | tuple[str, ...],
     num_sparsecore_per_device: int | None = None,
 ) -> EmbeddingVariables:
   """Initializes a stacked embedding table."""
