From f910d4fcab1fa57386aa40645b8814833377b531 Mon Sep 17 00:00:00 2001 From: Lingvo Maintenance Date: Fri, 6 Sep 2024 12:13:20 -0700 Subject: [PATCH] Fix a bug in RotaryPositionalEmbeddingLayer. The timescale (float32) was cast to int32 (when passed from ROPE) which loses precision. PiperOrigin-RevId: 671851096 --- lingvo/core/layers.py | 4 +++- lingvo/core/layers_test.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lingvo/core/layers.py b/lingvo/core/layers.py index 12395ee76..8f3c8d44a 100644 --- a/lingvo/core/layers.py +++ b/lingvo/core/layers.py @@ -3515,7 +3515,9 @@ def FProp(self, theta, inputs, position=None): position = position[:, :, tf.newaxis, tf.newaxis] timescale = timescale[tf.newaxis, tf.newaxis, tf.newaxis, :] - sinusoid_inp = position / tf.cast(timescale, position.dtype) + sinusoid_inp = tf.cast( + tf.cast(position, timescale.dtype) / timescale, inputs.dtype + ) sin = tf.sin(sinusoid_inp) cos = tf.cos(sinusoid_inp) first_half, second_half = tf.split(inputs, 2, axis=-1) diff --git a/lingvo/core/layers_test.py b/lingvo/core/layers_test.py index fee1898bf..ff6aa0c5b 100644 --- a/lingvo/core/layers_test.py +++ b/lingvo/core/layers_test.py @@ -4385,7 +4385,8 @@ def testSinusoidalPositionalEmbeddingLayer(self): math.cos(p / 2 * math.pi)] for p in range(4)] self.assertAllClose(actual_position_embs, expected_output) - def testRotaryPositionalEmbeddingLayer(self): + @parameterized.named_parameters(('default', False), ('has_position', True)) + def testRotaryPositionalEmbeddingLayer(self, has_position=False): with self.session(use_gpu=False): p = layers.RotaryPositionalEmbeddingLayer.Params() p.name = 'position_emb' @@ -4394,15 +4395,19 @@ def testRotaryPositionalEmbeddingLayer(self): p.embedding_dim = 4 seq_length = 5 inputs = tf.ones([1, seq_length, 1, p.embedding_dim]) + if has_position: + positions = tf.range(seq_length)[tf.newaxis, :] + else: + positions = None pos_emb_layer = p.Instantiate() self.evaluate(tf.global_variables_initializer()) - position_embs = pos_emb_layer.FPropDefaultTheta(inputs) + position_embs = pos_emb_layer.FPropDefaultTheta(inputs, positions) position_embs = tf.squeeze(position_embs, axis=[0, 2]) actual_position_embs, = self.evaluate([position_embs]) expected_output = [ - [1., 1., 1., 1.], + [1.0, 1.0, 1.0, 1.0], [-0.30116868, 0.5603883, 1.3817732, 1.2984471], [-1.3254442, 0.04166961, 0.4931506, 1.4135995], [-1.1311125, -0.48293126, -0.8488725, 1.3292018],