10 changes: 6 additions & 4 deletions lm_engine/hf_models/config/__init__.py
@@ -15,6 +15,7 @@
     _CausalConvolution,
     _GRUArgs,
     _Mamba2Args,
+    _MixtureOfAttentionArgs,
     _MultiHeadLatentAttentionArgs,
     _RNNArgs,
     _SoftmaxAttentionArgs,
@@ -73,6 +74,7 @@ def _update_with_key_value(block: dict, kwargs: dict, key: str) -> None:
"rnn": _RNNArgs,
"stickbreaking_attention": _StickbreakingAttentionArgs,
"softmax_attention": _SoftmaxAttentionArgs,
"momha": _MixtureOfAttentionArgs,
}

_MLP_CONFIG_CLASSES = {"MLP": _MLPArgs, "MoE": _MoEArgs}
@@ -136,10 +138,10 @@ def __init__(

         self.rope_dim = rope_dim
         if self.rope_dim is None and position_embedding_type == "rope":
-            assert (
-                self.check_equal_for_all_and_get_value("sequence_mixer_blocks", "sequence_mixer_type")
-                == "softmax_attention"
-            ), "specify rope_dim"
+            assert self.check_equal_for_all_and_get_value("sequence_mixer_blocks", "sequence_mixer_type") in [
+                "softmax_attention",
+                "momha",
+            ], "specify rope_dim"
 
             self.rope_dim = divide_if_divisible(
                 self.hidden_size,
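Taken together, the three hunks above register the new block type: _MixtureOfAttentionArgs is exported, mapped to the "momha" key in the sequence-mixer config registry, and accepted by the rope_dim auto-derivation check, so a model whose blocks all use "momha" no longer has to set rope_dim explicitly. A minimal, self-contained sketch of that registry pattern (the dataclass fields and the _build_block_args helper are illustrative, not the repo's actual definitions):

from dataclasses import dataclass

@dataclass
class _SoftmaxAttentionArgs:
    sequence_mixer_type: str = "softmax_attention"
    num_attention_heads: int = 12

@dataclass
class _MixtureOfAttentionArgs(_SoftmaxAttentionArgs):
    sequence_mixer_type: str = "momha"
    num_experts: int = 8

# mirrors the shape of _SEQUENCE_MIXER_CONFIG_CLASSES after this change
_SEQUENCE_MIXER_CONFIG_CLASSES = {
    "softmax_attention": _SoftmaxAttentionArgs,
    "momha": _MixtureOfAttentionArgs,
}

def _build_block_args(block: dict):
    # hypothetical helper: look up the args class by its type string and build it from the block dict
    return _SEQUENCE_MIXER_CONFIG_CLASSES[block["sequence_mixer_type"]](**block)

args = _build_block_args({"sequence_mixer_type": "momha", "num_experts": 4})
assert args.num_experts == 4 and args.num_attention_heads == 12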
8 changes: 8 additions & 0 deletions lm_engine/hf_models/config/sequence_mixer.py
@@ -20,6 +20,14 @@ def model_post_init(self, __context: Any) -> None:
         assert self.sequence_mixer_type == "softmax_attention"
 
 
+class _MixtureOfAttentionArgs(_SoftmaxAttentionArgs):
+    sequence_mixer_type: str = "momha"
+    num_experts: int = 8
+
+    def model_post_init(self, __context: Any) -> None:
+        assert self.sequence_mixer_type == "momha"
+
+
 class _MultiHeadLatentAttentionArgs(BaseArgs):
     sequence_mixer_type: str = "multihead_latent_attention"
     num_attention_heads: int | None = None
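Because _MixtureOfAttentionArgs subclasses _SoftmaxAttentionArgs, a "momha" block accepts every softmax-attention field plus num_experts (default 8), and model_post_init rejects any other sequence_mixer_type. A rough usage sketch, assuming BaseArgs validates keyword arguments and then runs model_post_init, as pydantic-style models do:

# assumes the class behaves like a pydantic model: construct, validate fields, then run post-init
args = _MixtureOfAttentionArgs(num_experts=4)  # sequence_mixer_type defaults to "momha"
assert args.sequence_mixer_type == "momha" and args.num_experts == 4

try:
    _MixtureOfAttentionArgs(sequence_mixer_type="softmax_attention")
except AssertionError:
    print("a momha block must keep sequence_mixer_type == 'momha'")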
7 changes: 6 additions & 1 deletion lm_engine/hf_models/mixins/dense/layer.py
@@ -80,7 +80,12 @@ def _sequence_mixer_forward(
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: int | None = None,
     ) -> torch.Tensor:
-        if self.sequence_mixer_type in ["softmax_attention", "stickbreaking_attention", "multihead_latent_attention"]:
+        if self.sequence_mixer_type in [
+            "softmax_attention",
+            "stickbreaking_attention",
+            "multihead_latent_attention",
+            "momha",
+        ]:
             hidden_states = self.sequence_mixer(
                 hidden_states,
                 past_key_values=past_key_values,
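Adding "momha" to this branch means it shares the calling convention of the other attention-style mixers: the module is invoked with hidden_states plus past_key_values, cu_seqlens, and max_seqlen, rather than the recurrent or state-space path. A compact stand-in showing that contract (the stub below is purely illustrative; MixtureOfAttention's real forward is not part of this diff):

import torch
from torch import nn

class _ToyAttentionStyleMixer(nn.Module):
    # stand-in for any mixer on the attention branch, including "momha"
    def forward(self, hidden_states, past_key_values=None, cu_seqlens=None, max_seqlen=None):
        return hidden_states  # a real mixer attends over the sequence here

mixer = _ToyAttentionStyleMixer()
out = mixer(torch.randn(2, 16, 64), past_key_values=None, cu_seqlens=None, max_seqlen=None)
assert out.shape == (2, 16, 64)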
@@ -11,6 +11,7 @@
 from .causal_convolution import CausalConvolution
 from .gru import GRU
 from .mamba2 import Mamba2
+from .mixture_of_attention import MixtureOfAttention
 from .multihead_latent_attention import MultiHeadLatentAttention
 from .rnn import RNN
 from .stickbreaking_attention import PaddingFreeSBAttention, SBAttention
@@ -26,6 +27,7 @@
     | RNN
     | SBAttention
     | PaddingFreeSBAttention
+    | MixtureOfAttention
 )


@@ -136,6 +138,13 @@ def get_sequence_mixer(
             softmax_dropout=block.softmax_dropout,
             use_padding_free_transformer=use_padding_free_transformer,
         )
+    elif sequence_mixer_type == "momha":
+        return MixtureOfAttention(
+            **sequence_mixer_kwargs,
+            num_experts=block.num_experts,
+            softmax_dropout=block.softmax_dropout,
+            use_padding_free_transformer=use_padding_free_transformer,
+        )
     elif sequence_mixer_type == "stickbreaking_attention":
         if use_padding_free_transformer:
             return PaddingFreeSBAttention(**sequence_mixer_kwargs)
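With the factory branch above, a config block whose sequence_mixer_type is "momha" is materialized as a MixtureOfAttention module, receiving num_experts and softmax_dropout from the block args alongside the shared sequence-mixer kwargs. A rough end-to-end config fragment that would exercise this path (the top-level keys and the illustrative field values are assumptions, not copied from the repo):

# Illustrative only; exact keys and defaults may differ in lm_engine.
config_fragment = {
    "position_embedding_type": "rope",
    # rope_dim may now be omitted when every block is "momha" (or "softmax_attention"),
    # per the relaxed assert in config/__init__.py
    "sequence_mixer_blocks": [
        {
            "sequence_mixer_type": "momha",
            "num_experts": 8,            # new field introduced by _MixtureOfAttentionArgs
            "num_attention_heads": 16,   # inherited softmax-attention field (value is illustrative)
            "softmax_dropout": 0.0,      # forwarded to MixtureOfAttention by get_sequence_mixer
        }
        # ... one entry per layer
    ],
}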