
Commit 759e737

Squashed commit of lwilkison/dbo-plus-plus changes relative to nm/sage/dbo-full-cudagraphs
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 2def98d commit 759e737

17 files changed: +845 -299 lines changed

docs/design/fused_moe_modular_kernel.md

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
 The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
 The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, the `prepare_no_receive` function is like `prepare` except that it does not wait to receive results from other workers. Instead, it returns a "receiver" callback that must be invoked to wait for the final results from the other workers. Not all `FusedMoEPrepareAndFinalize` classes are required to support this method, but when it is available it can be used to interleave work with the initial all-to-all communication, e.g. interleaving shared experts with the fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally, the `finalize` function may or may not do the TopK weight application and reduction (please refer to the TopKWeightAndReduce section).
 
+
 ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
 
 ### FusedMoEPermuteExpertsUnpermute
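
To make the prepare_no_receive behaviour described above concrete, here is a minimal, self-contained sketch of the pattern, using hypothetical names (ToyPrepareAndFinalize and plain Python lists) rather than vLLM's real classes: the call kicks off the dispatch and hands back a receiver callback, so independent work such as the shared experts can run before the callback is invoked to collect the dispatched tokens.

# Illustrative sketch only; not vLLM's actual FusedMoEPrepareAndFinalize API.
from typing import Callable


class ToyPrepareAndFinalize:

    def prepare_no_receive(self, tokens: list[float]) -> Callable[[], list[float]]:
        # "Send" side of the dispatch: start the all-to-all without waiting.
        in_flight = [t * 2.0 for t in tokens]  # stand-in for quantize + dispatch

        def receiver() -> list[float]:
            # "Receive" side: invoked later to collect the dispatched tokens.
            return in_flight

        return receiver

    def finalize(self, expert_out: list[float]) -> list[float]:
        # Stand-in for the all-to-all combine (and optional weight application).
        return [t / 2.0 for t in expert_out]


pf = ToyPrepareAndFinalize()
tokens = [1.0, 2.0, 3.0]

receiver = pf.prepare_no_receive(tokens)     # start dispatch, do not block
shared_out = [t + 1.0 for t in tokens]       # overlap independent (shared expert) work
dispatched = receiver()                      # now wait for the dispatch results
fused_out = [t * 10.0 for t in dispatched]   # stand-in for the fused experts
print(shared_out, pf.finalize(fused_out))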

examples/offline_inference/data_parallel.py

Lines changed: 7 additions & 1 deletion
@@ -90,7 +90,13 @@ def parse_args():
     parser.add_argument(
         "--enable-microbatching",
         action="store_true",
-        help=("Enable microbatched execution"),
+        help=("Enable microbatched execution")
+    )
+    parser.add_argument(
+        "--compilation-config",
+        type=int,
+        default=0,
+        help=("Compilation optimization (O) level 0-3."),
     )
     parser.add_argument(
         "--compilation-config",

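For context, the new --compilation-config flag carries an integer optimization (O) level. A hedged sketch of how such a value is typically forwarded in an offline-inference script follows; the model name is a placeholder and the exact wiring in data_parallel.py may differ.

# Illustrative sketch only: forwarding the O level into the LLM constructor.
import argparse

from vllm import LLM

parser = argparse.ArgumentParser()
parser.add_argument("--compilation-config", type=int, default=0,
                    help="Compilation optimization (O) level 0-3.")
args = parser.parse_args()

llm = LLM(
    model="facebook/opt-125m",                    # placeholder model
    compilation_config=args.compilation_config,   # O level from the new flag
)
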
vllm/config/parallel.py

Lines changed: 7 additions & 0 deletions
@@ -135,6 +135,13 @@ class ParallelConfig:
     request is greater than this threshold, microbatching will be used.
     Otherwise, the request will be processed in a single batch."""
 
+    microbatch_schedule: Literal["mlp_overlap", "ATTN_SHARED_OVERLAP"] = "mlp_overlap"
+    """Schedule policy for microbatch overlap coordination.
+
+    - "mlp_overlap": overlap MLP compute and communication across ubatches
+    - "ATTN_SHARED_OVERLAP": overlap MLA attention and communication across ubatches
+    """
+
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
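A minimal usage sketch for the new field, assuming ParallelConfig is constructed directly on this branch: the field name and its two allowed values come from the diff above, while the other kwargs and the branch on the value are illustrative.

# Illustrative sketch only; assumes this branch's ParallelConfig fields.
from vllm.config import ParallelConfig

parallel_config = ParallelConfig(
    enable_expert_parallel=True,
    enable_microbatching=True,          # existing field used elsewhere in this commit
    microbatch_schedule="mlp_overlap",  # or "ATTN_SHARED_OVERLAP"
)

if parallel_config.microbatch_schedule == "mlp_overlap":
    ...  # overlap MLP compute with communication across ubatches
else:
    ...  # overlap MLA attention with communication across ubatches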

vllm/engine/arg_utils.py

Lines changed: 8 additions & 0 deletions
@@ -317,6 +317,7 @@ class EngineArgs:
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     enable_microbatching: bool = ParallelConfig.enable_microbatching
     microbatching_token_threshold: int = ParallelConfig.microbatching_token_threshold
+    microbatch_schedule: str = ParallelConfig.microbatch_schedule
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
     num_redundant_experts: int = EPLBConfig.num_redundant_experts
@@ -682,6 +683,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                     **parallel_kwargs["enable_microbatching"])
         parallel_group.add_argument("--microbatching-token-threshold",
                                     **parallel_kwargs["microbatching_token_threshold"])
+        parallel_group.add_argument(
+            "--microbatch-schedule",
+            dest="microbatch_schedule",
+            **parallel_kwargs["microbatch_schedule"])
+        parallel_group.add_argument("--enable-async-comms",
+                                    **parallel_kwargs["enable_async_comms"])
         parallel_group.add_argument("--enable-eplb",
                                     **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config",
@@ -1304,6 +1311,7 @@ def create_engine_config(
             enable_expert_parallel=self.enable_expert_parallel,
             enable_microbatching=self.enable_microbatching,
             microbatching_token_threshold=self.microbatching_token_threshold,
+            microbatch_schedule=self.microbatch_schedule,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
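
End to end, this plumbing lets the schedule be selected with --microbatch-schedule on the CLI or via EngineArgs, landing on ParallelConfig through create_engine_config. A hedged sketch follows; only the microbatching fields come from this commit and the model name is a placeholder.

# Illustrative sketch only: driving the new fields through EngineArgs.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",          # placeholder
    enable_expert_parallel=True,
    enable_microbatching=True,
    microbatch_schedule="mlp_overlap",  # new field added in this commit
)
vllm_config = engine_args.create_engine_config()
print(vllm_config.parallel_config.microbatch_schedule)  # -> "mlp_overlap"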

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 86 additions & 72 deletions
@@ -58,31 +58,63 @@ def _get_combine_config(self) -> Optional[deep_ep.Config]:
             return None
         return deep_ep.Buffer.get_combine_config(self.dp_size)
 
-    def _do_dispatch(
+    def _create_prepare_ops(
         self,
-        tokens: torch.Tensor,
-        token_scales: Optional[torch.Tensor],
-        rank_topk_ids: torch.Tensor,
-        rank_topk_weights: torch.Tensor,
-        num_experts: int,
+        a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> Callable:
+    ) -> mk.PrepareResultType:
+
+        # Apply router weights on input if requested (only supports topk=1)
+        if apply_router_weight_on_input:
+            topk = topk_ids.size(1)
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1")
+            a1 = a1 * topk_weights.to(a1.dtype)
+
+        # Quantize prior to dispatch for block-quantized path, otherwise defer
+        if quant_config.is_block_quantized:
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                a1_scale,
+                quant_dtype=quant_config.quant_dtype,
+                per_act_token_quant=quant_config.per_act_token_quant,
+                block_shape=quant_config.block_shape,
+            )
+            if a1q_scale is not None and a1q_scale.numel() == 1:
+                a1q_scale = a1q_scale.view(1, 1)
+            a1_post_scale = None
+        else:
+            a1q = a1
+            a1q_scale = None
+            a1_post_scale = a1_scale
 
-        has_scales = token_scales is not None
+        # Inline dispatch (sync send+recv)
+        has_scales = a1q_scale is not None
 
         (num_tokens_per_rank, num_tokens_per_rdma_rank,
          dispatch_expert_num_tokens, is_token_in_rank,
          event) = self.buffer.get_dispatch_layout(
-             topk_idx=rank_topk_ids,
+             topk_idx=topk_ids,
              num_experts=num_experts,
              previous_event=None,
              async_finish=False,
             allocate_on_comm_stream=False)
 
-        token_data = tokens
+        token_data: Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]
+        token_data = a1q
         if has_scales:
-            token_data = (tokens, token_scales)
+            token_data = (a1q, a1q_scale)
+
+        ########################################################################
+        yield  # Pre-dispatch done
+        ########################################################################
 
         (
             token_data, expert_topk_ids, expert_topk_weights,
@@ -94,10 +126,8 @@ def _do_dispatch(
             num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
             is_token_in_rank=is_token_in_rank,
             num_tokens_per_expert=dispatch_expert_num_tokens,
-            topk_idx=rank_topk_ids,
-            topk_weights=rank_topk_weights,
-            # expert_alignment rounds the number of tokens per expert
-            # to this value.
+            topk_idx=topk_ids,
+            topk_weights=topk_weights,
             expert_alignment=1,
             config=self._get_dispatch_config(),
             previous_event=None,
@@ -131,9 +161,12 @@ def _receiver(
         if self.async_prepare:
             event.current_stream_wait()
 
+        # Unpack token data
         if has_scales:
+            assert isinstance(token_data, tuple)
             expert_x, expert_x_scale = token_data
         else:
+            assert isinstance(token_data, torch.Tensor)
             expert_x, expert_x_scale = token_data, None
 
         # The existing MOE kernels assume that all entries of topk_ids are
@@ -174,58 +207,14 @@ def _receiver(
             per_act_token_quant=False,
             block_shape=quant_config.block_shape)
 
+        ########################################################################
+        yield  # Dispatch send+recv done (sync)
+        ########################################################################
+
         return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
                 expert_topk_weights)
 
-    def supports_async(self) -> bool:
-        return True
-
-    def prepare_async(
-        self,
-        a1: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-    ) -> Callable:
-
-        if apply_router_weight_on_input:
-            topk = topk_ids.size(1)
-            # TODO: this only works for topK=1, will need to update for topK>1
-            assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1")
-            a1 = a1 * topk_weights.to(a1.dtype)
-
-        if quant_config.is_block_quantized:
-            # Quant and Dispatch
-            a1q, a1q_scale = moe_kernel_quantize_input(
-                a1,
-                a1_scale,
-                quant_dtype=quant_config.quant_dtype,
-                per_act_token_quant=quant_config.per_act_token_quant,
-                block_shape=quant_config.block_shape,
-            )
-            if a1q_scale is not None and a1q_scale.numel() == 1:
-                a1q_scale = a1q_scale.view(1, 1)
-            a1_post_scale = None
-        else:
-            a1q = a1
-            a1q_scale = None
-            a1_post_scale = a1_scale
-
-        return self._do_dispatch(tokens=a1q,
-                                 token_scales=a1q_scale,
-                                 rank_topk_ids=topk_ids,
-                                 rank_topk_weights=topk_weights,
-                                 num_experts=num_experts,
-                                 a1_scale=a1_post_scale,
-                                 quant_config=quant_config)
-
-    def prepare(
+    def create_prepare_ops(
         self,
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
@@ -236,14 +225,14 @@ def prepare(
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> mk.PrepareResultType:
-        receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights,
-                                      topk_ids, num_experts, expert_map,
-                                      apply_router_weight_on_input,
-                                      quant_config)
-        return receiver()
-
-    def finalize(
+    ) -> mk.SyncPrepareOps:
+        return mk.SyncPrepareOps.from_generator(
+            self._create_prepare_ops(a1, a1_scale, a2_scale, topk_weights,
+                                     topk_ids, num_experts, expert_map,
+                                     apply_router_weight_on_input,
+                                     quant_config))
+
+    def _create_finalize_ops(
         self,
         output: torch.Tensor,
         fused_expert_output: torch.Tensor,
@@ -268,6 +257,10 @@ def finalize(
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
 
+        ########################################################################
+        yield  # Pre-combine done
+        ########################################################################
+
         combined_x, _, event = self.buffer.combine(
             x=fused_expert_output,
             handle=self.handle,
@@ -278,3 +271,24 @@
             allocate_on_comm_stream=False)
         # Respect inplace outputs.
         output.copy_(combined_x, non_blocking=True)
+
+        ########################################################################
+        yield  # Combine send-recv done
+        ########################################################################
+
+        return None
+
+    def create_finalize_ops(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> mk.SyncFinalizeOps:
+        return mk.SyncFinalizeOps.from_generator(
+            self._create_finalize_ops(output, fused_expert_output,
+                                      topk_weights, topk_ids,
+                                      apply_router_weight_on_input,
+                                      weight_and_reduce_impl))
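
The shape of this refactor: the former prepare/prepare_async pair becomes a single generator that yields at each phase boundary (pre-dispatch, dispatch send+recv, pre-combine, combine send+recv), and mk.SyncPrepareOps.from_generator / mk.SyncFinalizeOps.from_generator wrap that generator into an object the microbatch scheduler can advance one phase at a time. The wrapper implementation is not part of this diff, so the following is a hedged, self-contained sketch of the pattern with hypothetical names (StagedOps, step); it only illustrates how yield boundaries let two ubatches be interleaved.

# Hypothetical sketch of the "ops from a generator" pattern; the real
# SyncPrepareOps/SyncFinalizeOps in modular_kernel may differ.
from dataclasses import dataclass
from typing import Any, Generator


@dataclass
class StagedOps:
    """Steps a generator one phase at a time; the generator's final
    `return` value becomes the overall result."""

    _gen: Generator[None, None, Any]
    result: Any = None
    finished: bool = False

    @classmethod
    def from_generator(cls, gen: Generator[None, None, Any]) -> "StagedOps":
        return cls(_gen=gen)

    def step(self) -> None:
        # Advance to the next `yield` (phase boundary) or capture the return.
        try:
            next(self._gen)
        except StopIteration as stop:
            self.result = stop.value
            self.finished = True


def _toy_prepare_ops(name: str) -> Generator[None, None, str]:
    # phase 0: quantize inputs and build the dispatch layout
    yield  # pre-dispatch done
    # phase 1: dispatch send + recv, unpack the received tokens
    yield  # dispatch done
    return f"{name}: prepared"


# A scheduler can interleave two microbatches by alternating step() calls:
ubatch_a = StagedOps.from_generator(_toy_prepare_ops("ubatch_a"))
ubatch_b = StagedOps.from_generator(_toy_prepare_ops("ubatch_b"))
while not (ubatch_a.finished and ubatch_b.finished):
    if not ubatch_a.finished:
        ubatch_a.step()
    if not ubatch_b.finished:
        ubatch_b.step()
print(ubatch_a.result, ubatch_b.result)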
