From c8a0d33e2b3df075b8deaaaf566bf797bb0f0d0a Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Mon, 22 Dec 2025 10:55:07 +0100 Subject: [PATCH 1/3] Transformer Decoder: extend docs on input embedding scale --- i6_models/assemblies/transformer/transformer_decoder_v1.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/i6_models/assemblies/transformer/transformer_decoder_v1.py b/i6_models/assemblies/transformer/transformer_decoder_v1.py index 773e1474..d2456eb8 100644 --- a/i6_models/assemblies/transformer/transformer_decoder_v1.py +++ b/i6_models/assemblies/transformer/transformer_decoder_v1.py @@ -128,7 +128,8 @@ class TransformerDecoderV1Config(ModelConfiguration): block_cfg: Configuration for TransformerDecoderV1. input_dropout: Dropout applied to the input embedding. input_embedding_scale: Scale applied to the input embedding. - Set to `None` to apply a (tuned) default. + Set to `None` to apply a default that is suitable for ASR AED decoder models. + When training a pure LM, scale 1.0 may be a better choice. num_blocks: Number of transformer blocks in the decoder. num_output: Number of output labels/vocab dim. logits_bias: Whether to add a bias to the output logits. From 0d38e018872140e7fd2e1dc85da4f9f35910ad5d Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Tue, 20 Jan 2026 11:52:06 +0100 Subject: [PATCH 2/3] Remove LM mention --- i6_models/assemblies/transformer/transformer_decoder_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/i6_models/assemblies/transformer/transformer_decoder_v1.py b/i6_models/assemblies/transformer/transformer_decoder_v1.py index d2456eb8..0c772d0d 100644 --- a/i6_models/assemblies/transformer/transformer_decoder_v1.py +++ b/i6_models/assemblies/transformer/transformer_decoder_v1.py @@ -129,7 +129,6 @@ class TransformerDecoderV1Config(ModelConfiguration): input_dropout: Dropout applied to the input embedding. input_embedding_scale: Scale applied to the input embedding. Set to `None` to apply a default that is suitable for ASR AED decoder models. - When training a pure LM, scale 1.0 may be a better choice. num_blocks: Number of transformer blocks in the decoder. num_output: Number of output labels/vocab dim. logits_bias: Whether to add a bias to the output logits. From 09816260644623693481edc97c5b900d26e4f4ab Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Tue, 20 Jan 2026 14:30:56 +0100 Subject: [PATCH 3/3] describe scale --- i6_models/assemblies/transformer/transformer_decoder_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/i6_models/assemblies/transformer/transformer_decoder_v1.py b/i6_models/assemblies/transformer/transformer_decoder_v1.py index 0c772d0d..ab995857 100644 --- a/i6_models/assemblies/transformer/transformer_decoder_v1.py +++ b/i6_models/assemblies/transformer/transformer_decoder_v1.py @@ -128,7 +128,7 @@ class TransformerDecoderV1Config(ModelConfiguration): block_cfg: Configuration for TransformerDecoderV1. input_dropout: Dropout applied to the input embedding. input_embedding_scale: Scale applied to the input embedding. - Set to `None` to apply a default that is suitable for ASR AED decoder models. + Set to `None` to apply a default of sqrt(model_dim). num_blocks: Number of transformer blocks in the decoder. num_output: Number of output labels/vocab dim. logits_bias: Whether to add a bias to the output logits.