Keep rope at float32 precision (#1497)

grasskin · web-flow · commit 09d2fdd3e15f · 2024-03-13T11:20:35.000-07:00
* Keep rope at float32 precision

* Carry out all of RoPE in float32

* Formatting

* Cleanup

* Do not cast x
diff --git a/keras_nlp/models/gemma/gemma_attention.py b/keras_nlp/models/gemma/gemma_attention.py
@@ -94,13 +94,14 @@ def _apply_rope(self, x, positions):
         # TODO: refactor to use RotaryEmbedding layer?
         max_wavelength = 10000
         x_shape = ops.shape(x)
-        freq_exponents = (2.0 / x_shape[-1]) * ops.cast(
-            ops.arange(x_shape[-1] // 2, dtype="float32"), self.compute_dtype
+        freq_exponents = (2.0 / x_shape[-1]) * ops.arange(
+            x_shape[-1] // 2, dtype="float32"
         )
         timescale = max_wavelength**freq_exponents
         radians = positions[..., None] / timescale[None, None, :]
         radians = radians[..., None, :]
-        sin, cos = ops.sin(radians), ops.cos(radians)
+        sin = ops.cast(ops.sin(radians), self.compute_dtype)
+        cos = ops.cast(ops.cos(radians), self.compute_dtype)
         x1, x2 = ops.split(x, 2, axis=-1)
         # Avoid `ops.concatenate` for now, to avoid an obscure bug with XLA
         # compilation on jax. We should be able to remove this once the
@@ -156,10 +157,9 @@ def call(
     ):
         seq_len = ops.shape(x)[1]
         start_index = cache_update_index
-        positions = ops.cast(
-            ops.arange(seq_len, dtype="float32"), self.compute_dtype
-        )
-        positions = positions + ops.cast(start_index, self.compute_dtype)
+        positions = ops.arange(seq_len, dtype="float32")
+
+        positions = positions + ops.cast(start_index, "float32")
         query = self.query_dense(x)
         query = self._apply_rope(query, positions)