bias init in case of MoE use
haeggee committed Aug 16, 2024
1 parent d14315f · commit 5dc67fe
Showing 1 changed file with 4 additions and 0 deletions.
src/nanotron/models/starcoder2.py: 4 additions & 0 deletions
@@ -56,6 +56,7 @@
 from nanotron.parallel.tied_parameters import tie_parameters
 from nanotron.random import RandomStates, branch_random_state
 from nanotron.utils import checkpoint_method
+from nanotron.models.moe import ParallelDroplessMLP, SparseMLP


 def pad_to_right(tensor, mask, new_tensor=None):
@@ -1524,6 +1525,9 @@ def init_model_randomly(self, config):
                     module.bias.zero_()
                 else:
                     raise ValueError(f"Who the fuck is {param_name}?")
+            elif isinstance(module, ParallelDroplessMLP):
+                if hasattr(module, 'bias'):
+                    module.bias.zero_()
             elif isinstance(module, TensorParallelRowLinear):
                 if "weight" == param_name:
                     nn.init.normal_(module.weight, mean=0.0, std=sigma / math.sqrt(2 * num_layers))
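For reference, the added branch behaves as follows in isolation. This is a minimal sketch, not nanotron's actual code: the ParallelDroplessMLP class below is a hypothetical stand-in for the real module imported from nanotron.models.moe, and the torch.no_grad() guard is an assumption (in-place ops on leaf parameters that require gradients need one).

import torch
import torch.nn as nn


class ParallelDroplessMLP(nn.Module):
    """Hypothetical stand-in for nanotron's MoE MLP; only the bias matters here."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.bias = nn.Parameter(torch.randn(hidden_size))


def init_moe_bias(module: nn.Module) -> None:
    # Mirrors the added diff branch: zero the bias if the MoE module defines one.
    if isinstance(module, ParallelDroplessMLP) and hasattr(module, "bias"):
        with torch.no_grad():  # assumed guard for the in-place update
            module.bias.zero_()


mlp = ParallelDroplessMLP(hidden_size=8)
init_moe_bias(mlp)
assert torch.all(mlp.bias == 0)  # bias now starts at zero

Zero-initializing the bias matches the other branches of init_model_randomly (e.g., the bias path shown above the added lines), so MoE modules start out consistent with the dense layers.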
