keras-team · mattdangerw · Mar 14, 2024 · Mar 6, 2024 · Mar 12, 2024 · Mar 14, 2024
diff --git a/keras_nlp/models/gemma/gemma_backbone.py b/keras_nlp/models/gemma/gemma_backbone.py
@@ -249,20 +249,23 @@ def get_layout_map(device_mesh, model_parallel_dim_name="model"):
                 f"device_mesh.axis_names. {device_mesh.axis_name=}"
             )
         model_dim = model_parallel_dim_name
-        # The sharding is partition for the hidden_dim of the model.
+        # The sharding is set to replicate the hidden_dim of the model,
+        # so that the contracting dimensions for the qkv matmul are replicated
+        # and will be run as local computation.
+        # See https://github.com/keras-team/keras-nlp/issues/1464 for more details.
         layout_map = keras.distribution.LayoutMap(device_mesh)
-        layout_map["token_embedding/embeddings"] = (None, model_dim)
+        layout_map["token_embedding/embeddings"] = (model_dim, None)
         layout_map["decoder_block.*attention.*(query|key|value).*kernel"] = (
             None,
-            model_dim,
             None,
+            model_dim,
         )
         layout_map["decoder_block.*attention_output.*kernel"] = (
-            None,
             None,
             model_dim,
+            None,
         )
-        layout_map["decoder_block.*ffw_gating.*kernel"] = (model_dim, None)
-        layout_map["decoder_block.*ffw_linear.*kernel"] = (None, model_dim)
+        layout_map["decoder_block.*ffw_gating.*kernel"] = (None, model_dim)
+        layout_map["decoder_block.*ffw_linear.*kernel"] = (model_dim, None)
 
         return layout_map
diff --git a/keras_nlp/models/gemma/gemma_backbone_test.py b/keras_nlp/models/gemma/gemma_backbone_test.py
@@ -106,26 +106,26 @@ def test_distribution(self):
 
         for w in model.weights:
             if "token_embedding/embeddings" in w.path:
-                self.assertEqual(tuple(w.value.sharding.spec), (None, "model"))
+                self.assertEqual(tuple(w.value.sharding.spec), ("model", None))
             if "attention/query/kernel" in w.path:
                 self.assertEqual(
-                    tuple(w.value.sharding.spec), (None, "model", None)
+                    tuple(w.value.sharding.spec), (None, None, "model")
                 )
             if "attention/key/kernel" in w.path:
                 self.assertEqual(
-                    tuple(w.value.sharding.spec), (None, "model", None)
+                    tuple(w.value.sharding.spec), (None, None, "model")
                 )
             if "attention/value/kernel" in w.path:
                 self.assertEqual(
-                    tuple(w.value.sharding.spec), (None, "model", None)
+                    tuple(w.value.sharding.spec), (None, None, "model")
                 )
             if "attention/attention_output/kernel" in w.path:
                 self.assertEqual(
-                    tuple(w.value.sharding.spec), (None, None, "model")
+                    tuple(w.value.sharding.spec), (None, "model", None)
                 )
             if "ffw_gating/kernel" in w.path:
-                self.assertEqual(tuple(w.value.sharding.spec), ("model", None))
+                self.assertEqual(tuple(w.value.sharding.spec), (None, "model"))
             if "ffw_gating_2/kernel" in w.path:
-                self.assertEqual(tuple(w.value.sharding.spec), ("model", None))
-            if "ffw_linearl" in w.path:
                 self.assertEqual(tuple(w.value.sharding.spec), (None, "model"))
+            if "ffw_linearl" in w.path:
+                self.assertEqual(tuple(w.value.sharding.spec), ("model", None))