From 5dc67fe1acb864f3c771776f216b5c69be83ad9d Mon Sep 17 00:00:00 2001
From: Alex Hagele
Date: Fri, 16 Aug 2024 10:05:13 +0000
Subject: [PATCH] bias init in case of use for moe

---
 src/nanotron/models/starcoder2.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/nanotron/models/starcoder2.py b/src/nanotron/models/starcoder2.py
index b05a67bb..1f2eab7d 100644
--- a/src/nanotron/models/starcoder2.py
+++ b/src/nanotron/models/starcoder2.py
@@ -56,6 +56,7 @@
 from nanotron.parallel.tied_parameters import tie_parameters
 from nanotron.random import RandomStates, branch_random_state
 from nanotron.utils import checkpoint_method
+from nanotron.models.moe import ParallelDroplessMLP, SparseMLP
 
 
 def pad_to_right(tensor, mask, new_tensor=None):
@@ -1524,6 +1525,9 @@ def init_model_randomly(self, config):
                     module.bias.zero_()
                 else:
                     raise ValueError(f"Who the fuck is {param_name}?")
+            elif isinstance(module, ParallelDroplessMLP):
+                if hasattr(module, 'bias'):
+                    module.bias.zero_()
             elif isinstance(module, TensorParallelRowLinear):
                 if "weight" == param_name:
                     nn.init.normal_(module.weight, mean=0.0, std=sigma / math.sqrt(2 * num_layers))
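
For context, the hunk above extends nanotron's `init_model_randomly` so that a `ParallelDroplessMLP` only has its bias zeroed when the attribute actually exists, since an MoE expert block may be built without one. Below is a minimal, self-contained sketch of the same guard pattern; `ToyExpertMLP` and `init_randomly` are hypothetical stand-ins for illustration, not the real nanotron module or method:

```python
# Sketch of the hasattr-guarded bias init added by the patch
# (ToyExpertMLP is a made-up stand-in for ParallelDroplessMLP).
import math

import torch
from torch import nn


class ToyExpertMLP(nn.Module):
    """A module that, like an MoE expert MLP, may or may not own a bias."""

    def __init__(self, hidden_size: int, with_bias: bool = True):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(hidden_size, hidden_size))
        if with_bias:
            self.bias = nn.Parameter(torch.empty(hidden_size))


def init_randomly(model: nn.Module, sigma: float = 0.02, num_layers: int = 2) -> None:
    with torch.no_grad():  # in-place init on leaf parameters must not track grads
        for module in model.modules():
            if isinstance(module, ToyExpertMLP):
                nn.init.normal_(module.weight, mean=0.0, std=sigma / math.sqrt(2 * num_layers))
                # Mirrors the patch: only zero the bias if the module has one.
                if hasattr(module, "bias"):
                    module.bias.zero_()


model = nn.Sequential(ToyExpertMLP(8), ToyExpertMLP(8, with_bias=False))
init_randomly(model)
```

The `hasattr` guard is the point of the patch: unlike `TensorParallelColumnLinear`, where the bias branch is reached only for an existing `bias` parameter, the MoE module's bias is optional, so an unconditional `module.bias.zero_()` would raise `AttributeError` on bias-free experts.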