Commit b5d843a

0 init on residuals, silu backward pass to fp32

Parent: 0b38cc0

File tree

2 files changed: 4 additions, 7 deletions

  mamba/mamba.py
  mamba/mamba_inner_fn.py

mamba/mamba.py

Lines changed: 3 additions & 6 deletions

@@ -125,6 +125,8 @@ def __init__(self, config: MambaConfig, use_cache: bool = False):
 
         self.scan_fn = selective_scan
 
+        self.resid_proj.weight.data.zero_()
+
     def _ssm(
         self,
         x,
@@ -246,14 +248,9 @@ def __init__(self, config: MambaConfig, use_cache=False):
 
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
-        self.lm_head.weight = self.embed_tokens.weight
-
         self.apply(self._init_weights)
 
-        for name, p in self.named_parameters():
-            if name in ["resid_proj.weight"]:
-                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-                p /= math.sqrt(1.0 * config.num_hidden_layers)
+        self.lm_head.weight.data.zero_()
 
     def _init_weights(self, module: nn.Module):
         """Initialize the weights"""

mamba/mamba_inner_fn.py

Lines changed: 1 addition & 1 deletion

@@ -127,7 +127,7 @@ def backward(ctx, grad_output):  # type: ignore
 
         dy = grad_output @ out_proj_w
 
-        dres = dy * y * silu_bwd(res)
+        dres = dy * y * silu_bwd(res.float())
 
         dout_proj_w = torch.sum(grad_output.mT @ y_f.to(x_conv_out.dtype), dim=0)
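The backward-pass change casts the saved activation to fp32 before evaluating the SiLU derivative. The repo's silu_bwd is not shown here; assuming it is the analytic derivative of silu(x) = x * sigmoid(x), the sketch below shows the rounding error the cast removes when training in bf16:

import torch

def silu_bwd(x: torch.Tensor) -> torch.Tensor:
    # Assumed definition: d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
    s = torch.sigmoid(x)
    return s * (1 + x * (1 - s))

res = torch.randn(1 << 16, dtype=torch.bfloat16)
lowp = silu_bwd(res)          # before: every intermediate rounded to bf16
fp32 = silu_bwd(res.float())  # after: derivative evaluated in fp32
print((fp32 - lowp.float()).abs().max())  # worst-case bf16 rounding error

Since dy and y stay in the compute dtype, multiplying them by the fp32 derivative promotes dres to fp32 under PyTorch's type-promotion rules, so only this elementwise term pays the extra precision cost.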
133133
