From 91e969263edfb3f8fbc302c2df44f17fab4b39a3 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 5 Jan 2026 17:00:23 +0800 Subject: [PATCH 1/9] fix eb5 prefix bug --- .../cache_manager/prefix_cache_manager.py | 77 +++++++++++++++---- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 91b23a29717..9c1f752a9ca 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1422,25 +1422,24 @@ def _revert_match_blocks( if len(matche_nodes) == 0: logger.error(f"req_id {request.request_id} revert nodes error, tokens: {revert_tokens}") break - revert_tokens -= block_size - revert_block = matche_nodes.pop() - revert_block_id = revert_block.block_id - if revert_block_id in match_gpu_block_ids: - match_gpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - gpu_match_token_num -= block_size - elif revert_block_id in match_cpu_block_ids: - match_cpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - cpu_match_token_num -= block_size - else: + try: + revert_tokens -= block_size + gpu_match_token_num, cpu_match_token_num = self._roll_back_block( + block_size=block_size, + matche_nodes=matche_nodes, + match_gpu_block_ids=match_gpu_block_ids, + match_cpu_block_ids=match_cpu_block_ids, + match_node_ids=match_node_ids, + swap_node_ids=swap_node_ids, + gpu_match_token_num=gpu_match_token_num, + cpu_match_token_num=cpu_match_token_num, + ) + except Exception as e: logger.error( - f"req_id {request.request_id} revert nodes error, nodes: {revert_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" + f"req_id {request.request_id} revert block error: {e}, " + f"revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" ) break - if revert_block_id in swap_node_ids: - swap_node_ids.remove(revert_block_id) if revert_tokens > 0: last_block_id = matche_nodes[-1].block_id @@ -1549,6 +1548,21 @@ def mm_match_block(self, request, block_size): heapq.heapify(self.cpu_lru_leaf_heap) if self.cache_config.disable_chunked_mm_input: + if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: + # when a full hit is achieved, roll back one block_size + try: + gpu_match_token_num, cpu_match_token_num = self._roll_back_block( + block_size=block_size, + matche_nodes=matche_nodes, + match_gpu_block_ids=match_gpu_block_ids, + match_cpu_block_ids=match_cpu_block_ids, + match_node_ids=match_node_ids, + swap_node_ids=swap_node_ids, + gpu_match_token_num=gpu_match_token_num, + cpu_match_token_num=cpu_match_token_num, + ) + except Exception as e: + logger.error(f"req_id {request.request_id} revert block error: {e}") matched_token_num = gpu_match_token_num + cpu_match_token_num is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) if is_chunked: @@ -1580,6 +1594,37 @@ def mm_match_block(self, request, block_size): cpu_match_token_num, ) + def _roll_back_block( + self, + block_size: int, + matche_nodes: list, + match_gpu_block_ids: list, + match_cpu_block_ids: list, + match_node_ids: list, + swap_node_ids: list, + gpu_match_token_num: int, + cpu_match_token_num: int, + ): + revert_block = matche_nodes.pop() + revert_block_id = revert_block.block_id + if revert_block_id in match_gpu_block_ids: + match_gpu_block_ids.remove(revert_block_id) + match_node_ids.remove(revert_block.node_id) + 
gpu_match_token_num -= block_size + elif revert_block_id in match_cpu_block_ids: + match_cpu_block_ids.remove(revert_block_id) + match_node_ids.remove(revert_block.node_id) + cpu_match_token_num -= block_size + else: + raise Exception( + f"revert nodes error, nodes: {revert_block_id}, match_gpu_block_ids: {match_gpu_block_ids}, " + f"match_cpu_block_ids: {match_cpu_block_ids}" + ) + + if revert_block_id in swap_node_ids: + swap_node_ids.remove(revert_block_id) + return gpu_match_token_num, cpu_match_token_num + def match_block(self, req_id, input_ids, block_size): """ Args: From 77492a9197e918087cae91100c71ea7590a5fe5a Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 5 Jan 2026 19:11:57 +0800 Subject: [PATCH 2/9] update ci test --- tests/cache_manager/test_prefix_cache_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/cache_manager/test_prefix_cache_manager.py b/tests/cache_manager/test_prefix_cache_manager.py index a834ff75843..b169b21c89e 100644 --- a/tests/cache_manager/test_prefix_cache_manager.py +++ b/tests/cache_manager/test_prefix_cache_manager.py @@ -960,7 +960,7 @@ def test_mm_match_block_handles_multimodal_inputs(self): manager = _create_manager(num_gpu_blocks=4) block_size = 2 manager.cache_config.disable_chunked_mm_input = False - input_ids = [1, 2, 3, 4] + input_ids = [1, 2, 3, 4, 5, 6, 7, 8] hash_input = get_hash_str(input_ids) hash_first = get_hash_str([1, 2]) hash_second = get_hash_str([3, 4], [hash_first, "img"]) @@ -990,6 +990,7 @@ def test_mm_match_block_handles_multimodal_inputs(self): "mm_hashes": ["img"], }, num_total_tokens=4, + need_prefill_tokens=8, ) match_gpu, match_cpu, swap_nodes, last_node, gpu_tokens, cpu_tokens = manager.mm_match_block( @@ -1123,6 +1124,7 @@ def test_mm_match_block_reverts_chunked_inputs(self): "mm_hashes": ["img"], }, num_total_tokens=4, + need_prefill_tokens=4, ) match_gpu, *_ = manager.mm_match_block(request, block_size) From 77177f0cebb3341f060d7a790c6ab43047c0faac Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 5 Jan 2026 21:17:38 +0800 Subject: [PATCH 3/9] update code --- tests/cache_manager/test_prefix_cache_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cache_manager/test_prefix_cache_manager.py b/tests/cache_manager/test_prefix_cache_manager.py index b169b21c89e..6cba1471386 100644 --- a/tests/cache_manager/test_prefix_cache_manager.py +++ b/tests/cache_manager/test_prefix_cache_manager.py @@ -989,7 +989,7 @@ def test_mm_match_block_handles_multimodal_inputs(self): "mm_positions": [SimpleNamespace(offset=2, length=2)], "mm_hashes": ["img"], }, - num_total_tokens=4, + num_total_tokens=8, need_prefill_tokens=8, ) From 185faedd547176b9ee0268a0046de337b6097b6b Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 6 Jan 2026 15:06:38 +0800 Subject: [PATCH 4/9] update code --- fastdeploy/cache_manager/prefix_cache_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 9c1f752a9ca..b2e180902d1 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1777,6 +1777,9 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num prefix_block_key.extend(extra_keys) hash_value = get_hash_str(current_block, prefix_block_key) prefix_block_key = [hash_value] + if hash_value in node.children: + node = node.children[hash_value] + continue allocated_block_id = gpu_block_ids.pop(0) node_id 
= self.node_id_pool.pop() unique_node_ids.append(node_id) From 326840b1eb9d8d4da4947970655e3719c932d962 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 6 Jan 2026 15:11:43 +0800 Subject: [PATCH 5/9] update code --- fastdeploy/cache_manager/prefix_cache_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index b2e180902d1..f2e692598a0 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1779,6 +1779,7 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num prefix_block_key = [hash_value] if hash_value in node.children: node = node.children[hash_value] + node.req_id_set.add(request.request_id) continue allocated_block_id = gpu_block_ids.pop(0) node_id = self.node_id_pool.pop() From 68241d479cbbbafef5d5989cc1112400c9d8c884 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 6 Jan 2026 16:47:05 +0800 Subject: [PATCH 6/9] update code --- fastdeploy/cache_manager/prefix_cache_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index f2e692598a0..b4689ddcdbe 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1777,11 +1777,12 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num prefix_block_key.extend(extra_keys) hash_value = get_hash_str(current_block, prefix_block_key) prefix_block_key = [hash_value] + allocated_block_id = gpu_block_ids.pop(0) if hash_value in node.children: node = node.children[hash_value] node.req_id_set.add(request.request_id) + self.recycle_gpu_blocks(allocated_block_id) continue - allocated_block_id = gpu_block_ids.pop(0) node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) new_last_node = BlockNode( From 7ee3ef36aaeb3421cc481c7bc004f1319c3474c2 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 7 Jan 2026 10:20:24 +0800 Subject: [PATCH 7/9] update code --- .../cache_manager/prefix_cache_manager.py | 72 +++++++++---------- .../engine/sched/resource_manager_v1.py | 34 +++++++-- 2 files changed, 66 insertions(+), 40 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index b4689ddcdbe..96d8a40b0d7 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1547,42 +1547,42 @@ def mm_match_block(self, request, block_size): if has_modified_cpu_lru_leaf_heap: heapq.heapify(self.cpu_lru_leaf_heap) - if self.cache_config.disable_chunked_mm_input: - if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: - # when a full hit is achieved, roll back one block_size - try: - gpu_match_token_num, cpu_match_token_num = self._roll_back_block( - block_size=block_size, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - match_node_ids=match_node_ids, - swap_node_ids=swap_node_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - ) - except Exception as e: - logger.error(f"req_id {request.request_id} revert block error: {e}") - matched_token_num = gpu_match_token_num + cpu_match_token_num - is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) - if is_chunked: - ( - 
gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self._revert_match_blocks( - request=request, - matched_token_num=matched_token_num, - block_size=block_size, - chunk_idx=chunk_idx, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=swap_node_ids, - ) + # if self.cache_config.disable_chunked_mm_input: + # if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: + # # when a full hit is achieved, roll back one block_size + # try: + # gpu_match_token_num, cpu_match_token_num = self._roll_back_block( + # block_size=block_size, + # matche_nodes=matche_nodes, + # match_gpu_block_ids=match_gpu_block_ids, + # match_cpu_block_ids=match_cpu_block_ids, + # match_node_ids=match_node_ids, + # swap_node_ids=swap_node_ids, + # gpu_match_token_num=gpu_match_token_num, + # cpu_match_token_num=cpu_match_token_num, + # ) + # except Exception as e: + # logger.error(f"req_id {request.request_id} revert block error: {e}") + # matched_token_num = gpu_match_token_num + cpu_match_token_num + # is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) + # if is_chunked: + # ( + # gpu_match_token_num, + # cpu_match_token_num, + # current_match_node, + # ) = self._revert_match_blocks( + # request=request, + # matched_token_num=matched_token_num, + # block_size=block_size, + # chunk_idx=chunk_idx, + # match_node_ids=match_node_ids, + # matche_nodes=matche_nodes, + # match_gpu_block_ids=match_gpu_block_ids, + # match_cpu_block_ids=match_cpu_block_ids, + # gpu_match_token_num=gpu_match_token_num, + # cpu_match_token_num=cpu_match_token_num, + # swap_node_ids=swap_node_ids, + # ) logger.info(f"match_block: req_id {request.request_id} matched nodes: {match_node_ids}") return ( diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 3d6157b538f..a29bbd01e2d 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -356,6 +356,24 @@ def _is_mm_request(self, request): return False + def revert_chunked_mm_input(self, mm_inputs, matched_token_num): + """ + revert mm_inputs that is chunked + """ + if mm_inputs is None or "mm_positions" not in mm_inputs or len(mm_inputs["mm_positions"]) == 0: + return matched_token_num + + for idx in range(len(mm_inputs["mm_positions"])): + position = mm_inputs["mm_positions"][idx] + if position.offset < matched_token_num < position.offset + position.length: + llm_logger.debug( + f"revert_chunked_mm_input, match token num: {matched_token_num}, revert tokens: {matched_token_num - position.offset}" + ) + return matched_token_num - position.offset + elif matched_token_num < position.offset: + break + return matched_token_num + def _get_num_new_tokens(self, request, token_budget): # TODO: set condition to new _get_num_new_tokens num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens @@ -937,11 +955,19 @@ def get_prefix_cached_blocks(self, request: Request): main_process_metrics.prefix_gpu_cache_token_num.inc(request.metrics.gpu_cache_token_num) main_process_metrics.prefix_cpu_cache_token_num.inc(request.metrics.gpu_cache_token_num) - if matched_token_num == request.need_prefill_tokens: - request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size - 
request.skip_allocate = True + if self.config.cache_config.disable_chunked_mm_input: + if matched_token_num == request.need_prefill_tokens: + matched_token_num = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + request.num_computed_tokens = self.revert_chunked_mm_input( + request.multimodal_inputs, matched_token_num + ) else: - request.num_computed_tokens = matched_token_num + if matched_token_num == request.need_prefill_tokens: + request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + else: + request.num_computed_tokens = matched_token_num return True except Exception as e: llm_logger.error(f"prefix match blocks error: {e}, {str(traceback.format_exc())} waiting reschedule...") From 6be33b72c1190aadeff14520d3d2bfc89a28a4df Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 7 Jan 2026 11:14:35 +0800 Subject: [PATCH 8/9] update code --- .../cache_manager/prefix_cache_manager.py | 127 -------- .../engine/sched/resource_manager_v1.py | 6 +- .../test_prefix_cache_manager.py | 39 --- tests/v1/cache_manager/test_revert_blocks.py | 302 ------------------ tests/v1/test_resource_manager_v1.py | 97 +++++- 5 files changed, 98 insertions(+), 473 deletions(-) delete mode 100644 tests/v1/cache_manager/test_revert_blocks.py diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 3ba2bddcc40..c2169127bb5 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1398,65 +1398,6 @@ def get_block_hash_extra_keys(self, request, start_idx, end_idx, mm_idx): hash_keys.append(mm_inputs["mm_hashes"][img_idx]) return len(mm_inputs["mm_positions"]) - 1, hash_keys - def _revert_match_blocks( - self, - request, - matched_token_num: int, - block_size: int, - chunk_idx: int, - match_node_ids: list, - matche_nodes: list, - match_gpu_block_ids: list, - match_cpu_block_ids: list, - gpu_match_token_num: int, - cpu_match_token_num: int, - swap_node_ids: list, - ): - # position = request.multimodal_inputs["mm_positions"][chunk_idx] - # revert_tokens = matched_token_num - position.offset - # TODO(chengyanfu): fix when is_chunked_mm_input=True, revert all matched tokens - revert_tokens = matched_token_num - match_block_ids = [node.block_id for node in matche_nodes] - logger.warning( - f"match_block: req_id {request.request_id} revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" - ) - while revert_tokens >= block_size: - if len(matche_nodes) == 0: - logger.error(f"req_id {request.request_id} revert nodes error, tokens: {revert_tokens}") - break - try: - revert_tokens -= block_size - gpu_match_token_num, cpu_match_token_num = self._roll_back_block( - block_size=block_size, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - match_node_ids=match_node_ids, - swap_node_ids=swap_node_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - ) - except Exception as e: - logger.error( - f"req_id {request.request_id} revert block error: {e}, " - f"revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" - ) - break - - if revert_tokens > 0: - last_block_id = matche_nodes[-1].block_id - if last_block_id in match_gpu_block_ids: - gpu_match_token_num -= revert_tokens - elif last_block_id in match_cpu_block_ids: - cpu_match_token_num -= revert_tokens - else: - logger.error( - f"req_id 
{request.request_id} revert nodes error, revert_tokens: {revert_tokens}, nodes: {last_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" - ) - current_node = self.radix_tree_root if len(matche_nodes) == 0 else matche_nodes[-1] - return gpu_match_token_num, cpu_match_token_num, current_node - def mm_match_block(self, request, block_size): """ Match and retrieve cached blocks for multimodal requests using a radix tree structure. @@ -1549,43 +1490,6 @@ def mm_match_block(self, request, block_size): if has_modified_cpu_lru_leaf_heap: heapq.heapify(self.cpu_lru_leaf_heap) - # if self.cache_config.disable_chunked_mm_input: - # if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: - # # when a full hit is achieved, roll back one block_size - # try: - # gpu_match_token_num, cpu_match_token_num = self._roll_back_block( - # block_size=block_size, - # matche_nodes=matche_nodes, - # match_gpu_block_ids=match_gpu_block_ids, - # match_cpu_block_ids=match_cpu_block_ids, - # match_node_ids=match_node_ids, - # swap_node_ids=swap_node_ids, - # gpu_match_token_num=gpu_match_token_num, - # cpu_match_token_num=cpu_match_token_num, - # ) - # except Exception as e: - # logger.error(f"req_id {request.request_id} revert block error: {e}") - # matched_token_num = gpu_match_token_num + cpu_match_token_num - # is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) - # if is_chunked: - # ( - # gpu_match_token_num, - # cpu_match_token_num, - # current_match_node, - # ) = self._revert_match_blocks( - # request=request, - # matched_token_num=matched_token_num, - # block_size=block_size, - # chunk_idx=chunk_idx, - # match_node_ids=match_node_ids, - # matche_nodes=matche_nodes, - # match_gpu_block_ids=match_gpu_block_ids, - # match_cpu_block_ids=match_cpu_block_ids, - # gpu_match_token_num=gpu_match_token_num, - # cpu_match_token_num=cpu_match_token_num, - # swap_node_ids=swap_node_ids, - # ) - logger.info(f"match_block: req_id {request.request_id} matched nodes: {match_node_ids}") return ( match_gpu_block_ids, @@ -1596,37 +1500,6 @@ def mm_match_block(self, request, block_size): cpu_match_token_num, ) - def _roll_back_block( - self, - block_size: int, - matche_nodes: list, - match_gpu_block_ids: list, - match_cpu_block_ids: list, - match_node_ids: list, - swap_node_ids: list, - gpu_match_token_num: int, - cpu_match_token_num: int, - ): - revert_block = matche_nodes.pop() - revert_block_id = revert_block.block_id - if revert_block_id in match_gpu_block_ids: - match_gpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - gpu_match_token_num -= block_size - elif revert_block_id in match_cpu_block_ids: - match_cpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - cpu_match_token_num -= block_size - else: - raise Exception( - f"revert nodes error, nodes: {revert_block_id}, match_gpu_block_ids: {match_gpu_block_ids}, " - f"match_cpu_block_ids: {match_cpu_block_ids}" - ) - - if revert_block_id in swap_node_ids: - swap_node_ids.remove(revert_block_id) - return gpu_match_token_num, cpu_match_token_num - def match_block(self, req_id, input_ids, block_size): """ Args: diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index e1f6659fefd..bbdb5d0e7aa 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -366,10 +366,7 @@ def 
revert_chunked_mm_input(self, mm_inputs, matched_token_num): for idx in range(len(mm_inputs["mm_positions"])): position = mm_inputs["mm_positions"][idx] if position.offset < matched_token_num < position.offset + position.length: - llm_logger.debug( - f"revert_chunked_mm_input, match token num: {matched_token_num}, revert tokens: {matched_token_num - position.offset}" - ) - return matched_token_num - position.offset + return position.offset elif matched_token_num < position.offset: break return matched_token_num @@ -988,6 +985,7 @@ def get_prefix_cached_blocks(self, request: Request): request.skip_allocate = True else: request.num_computed_tokens = matched_token_num + llm_logger.info(f"request {request.request_id} num_computed_tokens: {request.num_computed_tokens}") return True except Exception as e: llm_logger.error(f"prefix match blocks error: {e}, {str(traceback.format_exc())} waiting reschedule...") diff --git a/tests/cache_manager/test_prefix_cache_manager.py b/tests/cache_manager/test_prefix_cache_manager.py index e4b540240df..7cd5d0aff20 100644 --- a/tests/cache_manager/test_prefix_cache_manager.py +++ b/tests/cache_manager/test_prefix_cache_manager.py @@ -821,17 +821,6 @@ def test_update_cache_blocks_refreshes_mappings(self): self.assertIn(req_id, manager.leaf_req_map[new_leaf]) self.assertEqual(task.num_cached_blocks, 2) - def test_is_chunked_mm_input_detects_overlap(self): - manager = _create_manager() - mm_inputs = { - "mm_positions": [SimpleNamespace(offset=2, length=3)], - "mm_hashes": ["img"], - } - - chunked, idx = manager.is_chunked_mm_input(mm_inputs, matched_token_num=3) - self.assertTrue(chunked) - self.assertEqual(idx, 0) - def test_issue_and_sync_swap_tasks(self): manager = _create_manager() manager.cache_task_queue = _DummyEngineCacheQueue() @@ -1196,34 +1185,6 @@ def test_clear_prefix_cache_resets_on_signal(self): with self.assertRaises(SystemExit): manager.clear_prefix_cache() - @unittest.skip("Skip TestRevertMatchBlocks") - def test_revert_match_blocks_adjusts_lists(self): - manager = _create_manager() - request = SimpleNamespace( - request_id="revert", - multimodal_inputs={"mm_positions": [SimpleNamespace(offset=2, length=2)]}, - ) - node = BlockNode(120, [1, 2], 0, 1, 0, 2, get_hash_str([1, 2]), 0, parent=manager.radix_tree_root) - matche_nodes = [node] - match_gpu = [0] - match_node_ids = [node.node_id] - swap_nodes = [node.block_id] - gpu_tokens, cpu_tokens, current = manager._revert_match_blocks( - request=request, - matched_token_num=4, - block_size=2, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu, - match_cpu_block_ids=[], - gpu_match_token_num=4, - cpu_match_token_num=0, - swap_node_ids=swap_nodes, - ) - self.assertEqual(gpu_tokens, 2) - self.assertEqual(current, manager.radix_tree_root) - # Coverage-oriented tests. These are used to lightly exercise specific # implementation details without constraining core behavior. diff --git a/tests/v1/cache_manager/test_revert_blocks.py b/tests/v1/cache_manager/test_revert_blocks.py deleted file mode 100644 index 0cc3def4ae7..00000000000 --- a/tests/v1/cache_manager/test_revert_blocks.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from dataclasses import asdict -from types import SimpleNamespace - -from fastdeploy.cache_manager.cache_data import BlockNode -from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager -from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig -from fastdeploy.engine.args_utils import EngineArgs -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.scheduler import SchedulerConfig - - -def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_override=100, max_num_batched_tokens=3200): - engine_args = EngineArgs( - max_num_seqs=max_num_seqs, - num_gpu_blocks_override=num_gpu_blocks_override, - max_num_batched_tokens=max_num_batched_tokens, - ) - args = asdict(engine_args) - cache_cfg = CacheConfig(args) - model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) - speculative_cfg = SimpleNamespace(method=None) - model_cfg.print = print - model_cfg.architectures = ["test_model"] - cache_cfg.bytes_per_layer_per_block = 1 - parallel_cfg = ParallelConfig(args) - scheduler_cfg = SchedulerConfig(args) - graph_opt_cfg = engine_args.create_graph_optimization_config() - fd_config = FDConfig( - model_config=model_cfg, - cache_config=cache_cfg, - parallel_config=parallel_cfg, - graph_opt_config=graph_opt_cfg, - speculative_config=speculative_cfg, - scheduler_config=scheduler_cfg, - ) - return PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed") - - -class TestIsChunkedMMInput(unittest.TestCase): - def setUp(self): - self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def test_is_chunked_mm_input_none_input(self): - result, idx = self.cache_manager.is_chunked_mm_input(None, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_no_mm_positions(self): - mm_inputs = {"other_field": "value"} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_empty_positions(self): - mm_inputs = {"mm_positions": []} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 8) - self.assertTrue(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_second_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 25) - self.assertTrue(result) - self.assertEqual(idx, 1) - - def test_is_chunked_mm_input_before_first_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 3) - 
self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_after_last_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 35) - self.assertFalse(result) - self.assertEqual(idx, 0) - - -@unittest.skip("Skip TestRevertMatchBlocks") -class TestRevertMatchBlocks(unittest.TestCase): - def setUp(self): - self.block_size = 64 - self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def make_match_blocks(self, gpu_block_num, cpu_block_num): - block_num = gpu_block_num + cpu_block_num - matched_token_num = block_num * self.block_size - match_node_ids = [] - matche_nodes = [] - match_gpu_block_ids = [] - match_cpu_block_ids = [] - for idx in range(block_num): - node_id = idx + 10 - block = BlockNode(node_id, [], 0, 0, idx, 0, None, None, None) - match_node_ids.append(node_id) - matche_nodes.append(block) - match_gpu_block_ids.append(idx) - - for _ in range(cpu_block_num): - match_cpu_block_ids.append(match_gpu_block_ids.pop()) - - gpu_match_token_num = len(match_gpu_block_ids) * self.block_size - cpu_match_token_num = len(match_cpu_block_ids) * self.block_size - return ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) - - def test_revert_full_blocks(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=0, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1320, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=2, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 0) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 0) - self.assertEqual(len(match_gpu_block_ids), 0) - - def test_revert_partial_block(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=120, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1440, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=20, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - 
match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 120) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 2) - self.assertEqual(len(match_gpu_block_ids), 2) - - def test_revert_with_cpu_blocks(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=120, length=1200), ImagePosition(offset=1440, length=420)], - "mm_hashes": ["image1", "image2"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120 + [-1] * 420, - "prompt_token_ids_len": 1860, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=22, cpu_block_num=6) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=1, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 22 * self.block_size) - self.assertEqual(cpu_match_token_num, 32) - self.assertEqual(len(match_node_ids), 23) - self.assertEqual(len(match_gpu_block_ids), 22) - self.assertEqual(len(match_cpu_block_ids), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py index 06f55865e8b..0e1d748edb0 100644 --- a/tests/v1/test_resource_manager_v1.py +++ b/tests/v1/test_resource_manager_v1.py @@ -25,7 +25,7 @@ from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig from fastdeploy.engine.args_utils import EngineArgs -from fastdeploy.engine.request import Request +from fastdeploy.engine.request import ImagePosition, Request from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1 @@ -189,5 +189,100 @@ def test_download_features_retry(self): self.assertEqual(self.request.error_code, 530) +class TestRevertChunkedMMInput(unittest.TestCase): + def setUp(self): + max_num_seqs = 2 + engine_args = EngineArgs( + max_num_seqs=max_num_seqs, + num_gpu_blocks_override=102, + max_num_batched_tokens=3200, + ) + args = asdict(engine_args) + + cache_cfg = CacheConfig(args) + model_cfg = SimpleNamespace(enable_mm=True) # Enable multimodal for feature testing + speculative_cfg = SimpleNamespace(method=None) + model_cfg.print = print + model_cfg.max_model_len = 5120 + model_cfg.architectures = ["test_model"] + cache_cfg.bytes_per_layer_per_block = 1 + parallel_cfg = ParallelConfig(args) + scheduler_cfg = SchedulerConfig(args) + graph_opt_cfg = engine_args.create_graph_optimization_config() + + fd_config = FDConfig( + model_config=model_cfg, + cache_config=cache_cfg, + parallel_config=parallel_cfg, + graph_opt_config=graph_opt_cfg, + speculative_config=speculative_cfg, + scheduler_config=scheduler_cfg, + ) + self.manager = ResourceManagerV1( + 
max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed" + ) + req_dict = { + "request_id": "test_request", + "multimodal_inputs": {}, + } + self.request = Request.from_dict(req_dict) + self.request.async_process_futures = [] + self.request.multimodal_inputs = {} + + def test_revert_chunked_mm_input_none_input(self): + result = self.manager.revert_chunked_mm_input(None, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_no_mm_positions(self): + mm_inputs = {"other_field": "value"} + result = self.manager.revert_chunked_mm_input(mm_inputs, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_empty_positions(self): + mm_inputs = {"mm_positions": []} + result = self.manager.revert_chunked_mm_input(mm_inputs, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_matched_in_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 8) + self.assertEqual(result, 5) + + def test_revert_chunked_mm_input_matched_in_second_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 25) + self.assertEqual(result, 20) + + def test_revert_chunked_mm_input_before_first_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 3) + self.assertEqual(result, 3) + + def test_revert_chunked_mm_input_after_last_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 35) + self.assertEqual(result, 35) + + if __name__ == "__main__": unittest.main() From 7fadd26f0277ddc9dc84a0b0ef5e5ed9f1181212 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 7 Jan 2026 11:19:36 +0800 Subject: [PATCH 9/9] update code --- .../cache_manager/prefix_cache_manager.py | 5 --- .../test_prefix_cache_manager.py | 33 ++----------------- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index c2169127bb5..5ad94a3b550 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1653,11 +1653,6 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num hash_value = get_hash_str(current_block, prefix_block_key) prefix_block_key = [hash_value] allocated_block_id = gpu_block_ids.pop(0) - if hash_value in node.children: - node = node.children[hash_value] - node.req_id_set.add(request.request_id) - self.recycle_gpu_blocks(allocated_block_id) - continue node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) new_last_node = BlockNode( diff --git a/tests/cache_manager/test_prefix_cache_manager.py b/tests/cache_manager/test_prefix_cache_manager.py index 7cd5d0aff20..52d1c01040e 100644 --- a/tests/cache_manager/test_prefix_cache_manager.py +++ b/tests/cache_manager/test_prefix_cache_manager.py @@ -949,7 +949,7 @@ def test_mm_match_block_handles_multimodal_inputs(self): manager = _create_manager(num_gpu_blocks=4) block_size = 2 manager.cache_config.disable_chunked_mm_input = False - input_ids = [1, 2, 3, 4, 5, 6, 7, 8] + 
input_ids = [1, 2, 3, 4] hash_input = get_hash_str(input_ids) hash_first = get_hash_str([1, 2]) hash_second = get_hash_str([3, 4], [hash_first, "img"]) @@ -978,8 +978,7 @@ def test_mm_match_block_handles_multimodal_inputs(self): "mm_positions": [SimpleNamespace(offset=2, length=2)], "mm_hashes": ["img"], }, - num_total_tokens=8, - need_prefill_tokens=8, + num_total_tokens=4, ) match_gpu, match_cpu, swap_nodes, last_node, gpu_tokens, cpu_tokens = manager.mm_match_block( @@ -1091,34 +1090,6 @@ def test_free_block_ids_async_consumes_finished_future(self): self.assertIsNone(manager.gpu_free_task_future) self.assertTrue(finished.result_called) - def test_mm_match_block_reverts_chunked_inputs(self): - manager = _create_manager(num_gpu_blocks=4) - manager.cache_config.disable_chunked_mm_input = True - block_size = 2 - input_ids = [1, 2, 3, 4] - hash_input = get_hash_str(input_ids) - hash_first = get_hash_str([1, 2]) - hash_second = get_hash_str([3, 4], ["img"]) - node1 = BlockNode(80, input_ids, hash_input, 1, 0, block_size, hash_first, 0, parent=manager.radix_tree_root) - node2 = BlockNode(81, input_ids, hash_input, 2, 1, block_size, hash_second, 0, parent=node1) - manager.radix_tree_root.children[hash_first] = node1 - node1.children[hash_second] = node2 - - request = SimpleNamespace( - prompt_token_ids=input_ids, - output_token_ids=[], - request_id="chunk-req", - multimodal_inputs={ - "mm_positions": [SimpleNamespace(offset=1, length=3)], - "mm_hashes": ["img"], - }, - num_total_tokens=4, - need_prefill_tokens=4, - ) - - match_gpu, *_ = manager.mm_match_block(request, block_size) - self.assertEqual(match_gpu, []) - def test_mm_build_path_creates_new_nodes(self): manager = _create_manager(num_gpu_blocks=6) request = SimpleNamespace(
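
Note appended after the series (not part of any patch): patches 7-9 converge on
moving the chunked-multimodal rollback out of PrefixCacheManager and into
ResourceManagerV1.revert_chunked_mm_input, which clamps a prefix-cache match so
that it never ends inside an image's token span; when disable_chunked_mm_input
is set and the match is a full hit, get_prefix_cached_blocks first rolls the
match back by one block_size and then applies the same clamp. Below is a
minimal standalone sketch of that clamping policy, with assertions taken from
the expected values in TestRevertChunkedMMInput (patch 8). ImagePosition here
is a local stand-in dataclass rather than fastdeploy.engine.request.ImagePosition,
and the function takes a bare list of positions instead of the mm_inputs dict,
so the None/missing-key guard from the real method is omitted.

    from dataclasses import dataclass

    @dataclass
    class ImagePosition:
        offset: int  # index of the first token the image occupies
        length: int  # number of tokens the image occupies

    def revert_chunked_mm_input(mm_positions, matched_token_num):
        """Clamp a prefix-cache match so it never ends inside an image span."""
        for position in mm_positions:
            if position.offset < matched_token_num < position.offset + position.length:
                # The match boundary falls inside this image: back up to its start.
                return position.offset
            if matched_token_num < position.offset:
                # Positions are ordered, so no later image can contain the boundary.
                break
        return matched_token_num

    if __name__ == "__main__":
        positions = [ImagePosition(offset=5, length=10), ImagePosition(offset=20, length=10)]
        assert revert_chunked_mm_input(positions, 8) == 5    # inside the first image
        assert revert_chunked_mm_input(positions, 25) == 20  # inside the second image
        assert revert_chunked_mm_input(positions, 3) == 3    # before any image
        assert revert_chunked_mm_input(positions, 35) == 35  # past the last image
        print("ok")

The net effect of the series is that mm_match_block reports the raw match and
the scheduler, not the cache manager, decides how far to roll it back.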