Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions tensorrt_llm/_torch/attention_backend/sparse/dsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,15 +1030,7 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
- Prepares schedule_metadata for fp8_paged_mqa_logits
- Stores generation request IDs for decode phase
"""
# Skip indexer preparation if the kv_cache_manager doesn't have index_head_dim.
# This can happen when the metadata is being used with a draft KV cache manager
# during MTP speculative decoding, which uses a regular KVCacheManager instead
# of DSACacheManager.
kv_cache_manager = metadata.kv_cache_manager
if kv_cache_manager is None or not hasattr(kv_cache_manager,
'index_head_dim'):
return

num_contexts = metadata.num_contexts
num_generations = metadata.num_generations
num_ctx_tokens = metadata.num_ctx_tokens
Expand Down
7 changes: 6 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,11 @@ def _create_one_model_draft_kv_cache_manager(
draft_kv_cache_manager_cls = KVCacheManager

estimating_kv_cache = estimating_kv_cache and not self._skip_est
# For MTP with models using sparse attention (e.g., DeepSeek V3 with DSA),
# the draft layers share the same architecture as the target model and need
# the sparse_attention_config. Get it from effective_draft_config which
# falls back to the target model's config for MTP mode.
sparse_attn_config = effective_draft_config.sparse_attention_config
return _create_kv_cache_manager(
model_engine=None,
kv_cache_manager_cls=draft_kv_cache_manager_cls,
Expand All @@ -657,7 +662,7 @@ def _create_one_model_draft_kv_cache_manager(
max_seq_len=self._max_seq_len,
max_batch_size=self._max_batch_size,
spec_config=self._speculative_config,
sparse_attn_config=None, # Not applicable for draft in one-model mode
sparse_attn_config=sparse_attn_config,
max_num_tokens=self._max_num_tokens,
max_beam_width=self._max_beam_width,
kv_connector_manager=self._kv_connector_manager,
Expand Down
3 changes: 0 additions & 3 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -358,16 +358,13 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5948435)
accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[tp8] SKIP (https://nvbugs/5951789)
unittest/_torch/modeling -k "modeling_siglip" SKIP (https://nvbugs/5941242)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_1k1k] SKIP (https://nvbugs/5919026)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_8k1k] SKIP (https://nvbugs/5919026)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5955765)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5955792)
accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/5955803)
examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_nvfp4 SKIP (https://nvbugs/5961414)
examples/test_visual_gen.py::test_vbench_dimension_score_wan SKIP (https://nvbugs/5961414)
examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_fp8 SKIP (https://nvbugs/5961414)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugs/5919026)
accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/5961430)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/5846166)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] SKIP (https://nvbugs/5846166)
Expand Down