From ba5a461ae33def4301c0f0a54e2ffa5fb289b710 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 5 Jan 2026 17:20:32 +0800 Subject: [PATCH 1/5] fix eb5 prefix bug --- .../cache_manager/prefix_cache_manager.py | 78 +++++++++++++++---- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 4142aeccaa2..df022547ac0 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1296,25 +1296,23 @@ def _revert_match_blocks( if len(matche_nodes) == 0: logger.error(f"req_id {request.request_id} revert nodes error, tokens: {revert_tokens}") break - revert_tokens -= block_size - revert_block = matche_nodes.pop() - revert_block_id = revert_block.block_id - if revert_block_id in match_gpu_block_ids: - match_gpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - gpu_match_token_num -= block_size - elif revert_block_id in match_cpu_block_ids: - match_cpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - cpu_match_token_num -= block_size - else: + try: + revert_tokens -= block_size + gpu_match_token_num, cpu_match_token_num = self._roll_back_block( + block_size=block_size, + matche_nodes=matche_nodes, + match_gpu_block_ids=match_gpu_block_ids, + match_cpu_block_ids=match_cpu_block_ids, + match_node_ids=match_node_ids, + swap_node_ids=swap_node_ids, + gpu_match_token_num=gpu_match_token_num, + cpu_match_token_num=cpu_match_token_num, + ) + except Exception as e: logger.error( - f"req_id {request.request_id} revert nodes error, nodes: {revert_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" + f"req_id {request.request_id} revert block error: {e}, " + f"revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" ) - break - if revert_block_id in swap_node_ids: - swap_node_ids.remove(revert_block_id) if revert_tokens > 0: last_block_id = matche_nodes[-1].block_id @@ -1419,6 +1417,21 @@ def mm_match_block(self, request, block_size): heapq.heapify(self.cpu_lru_leaf_heap) if self.cache_config.disable_chunked_mm_input: + if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: + # when a full hit is achieved, roll back one block_size + try: + gpu_match_token_num, cpu_match_token_num = self._roll_back_block( + block_size=block_size, + matche_nodes=matche_nodes, + match_gpu_block_ids=match_gpu_block_ids, + match_cpu_block_ids=match_cpu_block_ids, + match_node_ids=match_node_ids, + swap_node_ids=swap_node_ids, + gpu_match_token_num=gpu_match_token_num, + cpu_match_token_num=cpu_match_token_num, + ) + except Exception as e: + logger.error(f"req_id {request.request_id} revert block error: {e}") matched_token_num = gpu_match_token_num + cpu_match_token_num is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) if is_chunked: @@ -1450,6 +1463,37 @@ def mm_match_block(self, request, block_size): cpu_match_token_num, ) + def _roll_back_block( + self, + block_size: int, + matche_nodes: list, + match_gpu_block_ids: list, + match_cpu_block_ids: list, + match_node_ids: list, + swap_node_ids: list, + gpu_match_token_num: int, + cpu_match_token_num: int, + ): + revert_block = matche_nodes.pop() + revert_block_id = revert_block.block_id + if revert_block_id in match_gpu_block_ids: + match_gpu_block_ids.remove(revert_block_id) + match_node_ids.remove(revert_block.node_id) + 
gpu_match_token_num -= block_size + elif revert_block_id in match_cpu_block_ids: + match_cpu_block_ids.remove(revert_block_id) + match_node_ids.remove(revert_block.node_id) + cpu_match_token_num -= block_size + else: + raise Exception( + f"revert nodes error, nodes: {revert_block_id}, match_gpu_block_ids: {match_gpu_block_ids}, " + f"match_cpu_block_ids: {match_cpu_block_ids}" + ) + + if revert_block_id in swap_node_ids: + swap_node_ids.remove(revert_block_id) + return gpu_match_token_num, cpu_match_token_num + def match_block(self, req_id, input_ids, block_size): """ Args: From cbda9a675ccc2f285b587d20f47473ed6830814c Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 6 Jan 2026 15:14:15 +0800 Subject: [PATCH 2/5] update code --- fastdeploy/cache_manager/prefix_cache_manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index df022547ac0..85b20800281 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1638,6 +1638,10 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num mm_idx=mm_idx, ) hash_value = self.hash_block_features(current_block, extra_keys) + if hash_value in node.children: + node = node.children[hash_value] + node.req_id_set.add(request.request_id) + continue allocated_block_id = gpu_block_ids.pop(0) node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) From c768a5ab60a264e5d15de5486146309359f5dc5e Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 6 Jan 2026 16:45:45 +0800 Subject: [PATCH 3/5] update code --- fastdeploy/cache_manager/prefix_cache_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 85b20800281..d1b176975e2 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1638,11 +1638,12 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num mm_idx=mm_idx, ) hash_value = self.hash_block_features(current_block, extra_keys) + allocated_block_id = gpu_block_ids.pop(0) if hash_value in node.children: node = node.children[hash_value] node.req_id_set.add(request.request_id) + self.recycle_gpu_blocks(allocated_block_id) continue - allocated_block_id = gpu_block_ids.pop(0) node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) new_last_node = BlockNode( From cbe8937d471b787cbc121fd0cb6cad6b60687125 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 7 Jan 2026 11:35:20 +0800 Subject: [PATCH 4/5] update code --- .../cache_manager/prefix_cache_manager.py | 100 ------ .../engine/sched/resource_manager_v1.py | 32 +- tests/v1/cache_manager/test_revert_blocks.py | 302 ------------------ tests/v1/test_resource_manager_v1.py | 97 +++++- 4 files changed, 124 insertions(+), 407 deletions(-) delete mode 100644 tests/v1/cache_manager/test_revert_blocks.py diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index d1b176975e2..1d757a996f9 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1270,64 +1270,6 @@ def hash_block_features(self, input_ids, extra_keys: list = []): """ return hashlib.sha256(pickle.dumps((input_ids, extra_keys))).hexdigest() - def _revert_match_blocks( - self, - request, - matched_token_num: int, - 
block_size: int, - chunk_idx: int, - match_node_ids: list, - matche_nodes: list, - match_gpu_block_ids: list, - match_cpu_block_ids: list, - gpu_match_token_num: int, - cpu_match_token_num: int, - swap_node_ids: list, - ): - # position = request.multimodal_inputs["mm_positions"][chunk_idx] - # revert_tokens = matched_token_num - position.offset - # TODO(chengyanfu): fix when is_chunked_mm_input=True, revert all matched tokens - revert_tokens = matched_token_num - match_block_ids = [node.block_id for node in matche_nodes] - logger.warning( - f"match_block: req_id {request.request_id} revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" - ) - while revert_tokens >= block_size: - if len(matche_nodes) == 0: - logger.error(f"req_id {request.request_id} revert nodes error, tokens: {revert_tokens}") - break - try: - revert_tokens -= block_size - gpu_match_token_num, cpu_match_token_num = self._roll_back_block( - block_size=block_size, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - match_node_ids=match_node_ids, - swap_node_ids=swap_node_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - ) - except Exception as e: - logger.error( - f"req_id {request.request_id} revert block error: {e}, " - f"revert tokens: {revert_tokens} from matched nodes: {match_block_ids}" - ) - - if revert_tokens > 0: - last_block_id = matche_nodes[-1].block_id - if last_block_id in match_gpu_block_ids: - gpu_match_token_num -= revert_tokens - elif last_block_id in match_cpu_block_ids: - cpu_match_token_num -= revert_tokens - else: - logger.error( - f"req_id {request.request_id} revert nodes error, revert_tokens: {revert_tokens}, nodes: {last_block_id}, " - f"match_gpu_block_ids: {match_gpu_block_ids}, match_cpu_block_ids: {match_cpu_block_ids}" - ) - current_node = self.radix_tree_root if len(matche_nodes) == 0 else matche_nodes[-1] - return gpu_match_token_num, cpu_match_token_num, current_node - def mm_match_block(self, request, block_size): """ Match and retrieve cached blocks for multimodal requests using a radix tree structure. 
@@ -1416,43 +1358,6 @@ def mm_match_block(self, request, block_size): if has_modified_cpu_lru_leaf_heap: heapq.heapify(self.cpu_lru_leaf_heap) - if self.cache_config.disable_chunked_mm_input: - if gpu_match_token_num + cpu_match_token_num == request.need_prefill_tokens: - # when a full hit is achieved, roll back one block_size - try: - gpu_match_token_num, cpu_match_token_num = self._roll_back_block( - block_size=block_size, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - match_node_ids=match_node_ids, - swap_node_ids=swap_node_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - ) - except Exception as e: - logger.error(f"req_id {request.request_id} revert block error: {e}") - matched_token_num = gpu_match_token_num + cpu_match_token_num - is_chunked, chunk_idx = self.is_chunked_mm_input(request.multimodal_inputs, matched_token_num) - if is_chunked: - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self._revert_match_blocks( - request=request, - matched_token_num=matched_token_num, - block_size=block_size, - chunk_idx=chunk_idx, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=swap_node_ids, - ) - logger.info(f"match_block: req_id {request.request_id} matched nodes: {match_node_ids}") return ( match_gpu_block_ids, @@ -1639,11 +1544,6 @@ def mm_build_path(self, request, num_computed_tokens, block_size, last_node, num ) hash_value = self.hash_block_features(current_block, extra_keys) allocated_block_id = gpu_block_ids.pop(0) - if hash_value in node.children: - node = node.children[hash_value] - node.req_id_set.add(request.request_id) - self.recycle_gpu_blocks(allocated_block_id) - continue node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) new_last_node = BlockNode( diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 1106b56f9fe..9b1303682c9 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -353,6 +353,21 @@ def _is_mm_request(self, request): return False + def revert_chunked_mm_input(self, mm_inputs, matched_token_num): + """ + revert mm_inputs that is chunked + """ + if mm_inputs is None or "mm_positions" not in mm_inputs or len(mm_inputs["mm_positions"]) == 0: + return matched_token_num + + for idx in range(len(mm_inputs["mm_positions"])): + position = mm_inputs["mm_positions"][idx] + if position.offset < matched_token_num < position.offset + position.length: + return position.offset + elif matched_token_num < position.offset: + break + return matched_token_num + def _get_num_new_tokens(self, request, token_budget): # TODO: set condition to new _get_num_new_tokens num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens @@ -904,11 +919,20 @@ def get_prefix_cached_blocks(self, request: Request): main_process_metrics.prefix_gpu_cache_token_num.inc(request.gpu_cache_token_num) main_process_metrics.prefix_cpu_cache_token_num.inc(request.cpu_cache_token_num) - if matched_token_num == request.need_prefill_tokens: - request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size - request.skip_allocate = True + if self.config.cache_config.disable_chunked_mm_input: + if 
matched_token_num == request.need_prefill_tokens: + matched_token_num = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + request.num_computed_tokens = self.revert_chunked_mm_input( + request.multimodal_inputs, matched_token_num + ) else: - request.num_computed_tokens = matched_token_num + if matched_token_num == request.need_prefill_tokens: + request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size + request.skip_allocate = True + else: + request.num_computed_tokens = matched_token_num + llm_logger.info(f"request {request.request_id} num_computed_tokens: {request.num_computed_tokens}") request.cache_prepare_time = time.time() - cache_prepare_time return True except Exception as e: diff --git a/tests/v1/cache_manager/test_revert_blocks.py b/tests/v1/cache_manager/test_revert_blocks.py deleted file mode 100644 index 0cc3def4ae7..00000000000 --- a/tests/v1/cache_manager/test_revert_blocks.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from dataclasses import asdict -from types import SimpleNamespace - -from fastdeploy.cache_manager.cache_data import BlockNode -from fastdeploy.cache_manager.prefix_cache_manager import PrefixCacheManager -from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig -from fastdeploy.engine.args_utils import EngineArgs -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.scheduler import SchedulerConfig - - -def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_override=100, max_num_batched_tokens=3200): - engine_args = EngineArgs( - max_num_seqs=max_num_seqs, - num_gpu_blocks_override=num_gpu_blocks_override, - max_num_batched_tokens=max_num_batched_tokens, - ) - args = asdict(engine_args) - cache_cfg = CacheConfig(args) - model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) - speculative_cfg = SimpleNamespace(method=None) - model_cfg.print = print - model_cfg.architectures = ["test_model"] - cache_cfg.bytes_per_layer_per_block = 1 - parallel_cfg = ParallelConfig(args) - scheduler_cfg = SchedulerConfig(args) - graph_opt_cfg = engine_args.create_graph_optimization_config() - fd_config = FDConfig( - model_config=model_cfg, - cache_config=cache_cfg, - parallel_config=parallel_cfg, - graph_opt_config=graph_opt_cfg, - speculative_config=speculative_cfg, - scheduler_config=scheduler_cfg, - ) - return PrefixCacheManager(config=fd_config, tensor_parallel_size=8, splitwise_role="mixed") - - -class TestIsChunkedMMInput(unittest.TestCase): - def setUp(self): - self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def test_is_chunked_mm_input_none_input(self): - result, idx = self.cache_manager.is_chunked_mm_input(None, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_no_mm_positions(self): - mm_inputs = {"other_field": 
"value"} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_empty_positions(self): - mm_inputs = {"mm_positions": []} - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 10) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 8) - self.assertTrue(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_matched_in_second_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 25) - self.assertTrue(result) - self.assertEqual(idx, 1) - - def test_is_chunked_mm_input_before_first_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 3) - self.assertFalse(result) - self.assertEqual(idx, 0) - - def test_is_chunked_mm_input_after_last_chunk(self): - mm_inputs = { - "mm_positions": [ - ImagePosition(offset=5, length=10), - ImagePosition(offset=20, length=10), - ] - } - result, idx = self.cache_manager.is_chunked_mm_input(mm_inputs, 35) - self.assertFalse(result) - self.assertEqual(idx, 0) - - -@unittest.skip("Skip TestRevertMatchBlocks") -class TestRevertMatchBlocks(unittest.TestCase): - def setUp(self): - self.block_size = 64 - self.cache_manager = make_prefix_cache_manager(max_num_seqs=3, enable_mm=True, num_gpu_blocks_override=100) - - def make_match_blocks(self, gpu_block_num, cpu_block_num): - block_num = gpu_block_num + cpu_block_num - matched_token_num = block_num * self.block_size - match_node_ids = [] - matche_nodes = [] - match_gpu_block_ids = [] - match_cpu_block_ids = [] - for idx in range(block_num): - node_id = idx + 10 - block = BlockNode(node_id, [], 0, 0, idx, 0, None, None, None) - match_node_ids.append(node_id) - matche_nodes.append(block) - match_gpu_block_ids.append(idx) - - for _ in range(cpu_block_num): - match_cpu_block_ids.append(match_gpu_block_ids.pop()) - - gpu_match_token_num = len(match_gpu_block_ids) * self.block_size - cpu_match_token_num = len(match_cpu_block_ids) * self.block_size - return ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) - - def test_revert_full_blocks(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=0, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1320, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=2, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - 
match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 0) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 0) - self.assertEqual(len(match_gpu_block_ids), 0) - - def test_revert_partial_block(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=120, length=1200)], - "mm_hashes": ["image1"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120, - "prompt_token_ids_len": 1440, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=20, cpu_block_num=0) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=0, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 120) - self.assertEqual(cpu_match_token_num, 0) - self.assertEqual(len(match_node_ids), 2) - self.assertEqual(len(match_gpu_block_ids), 2) - - def test_revert_with_cpu_blocks(self): - # Setup test data - multimodal_inputs = { - "mm_positions": [ImagePosition(offset=120, length=1200), ImagePosition(offset=1440, length=420)], - "mm_hashes": ["image1", "image2"], - } - req_dict = { - "request_id": "req1", - "prompt_token_ids": [1] * 120 + [-1] * 1200 + [2] * 120 + [-1] * 420, - "prompt_token_ids_len": 1860, - "multimodal_inputs": multimodal_inputs, - } - - ( - matched_token_num, - match_node_ids, - matche_nodes, - match_gpu_block_ids, - match_cpu_block_ids, - gpu_match_token_num, - cpu_match_token_num, - ) = self.make_match_blocks(gpu_block_num=22, cpu_block_num=6) - - # Call method - ( - gpu_match_token_num, - cpu_match_token_num, - current_match_node, - ) = self.cache_manager._revert_match_blocks( - request=Request.from_dict(req_dict), - matched_token_num=matched_token_num, - block_size=self.block_size, - chunk_idx=1, - match_node_ids=match_node_ids, - matche_nodes=matche_nodes, - match_gpu_block_ids=match_gpu_block_ids, - match_cpu_block_ids=match_cpu_block_ids, - gpu_match_token_num=gpu_match_token_num, - cpu_match_token_num=cpu_match_token_num, - swap_node_ids=[], - ) - - # Assertions - self.assertEqual(gpu_match_token_num, 22 * self.block_size) - self.assertEqual(cpu_match_token_num, 32) - self.assertEqual(len(match_node_ids), 23) - self.assertEqual(len(match_gpu_block_ids), 22) - self.assertEqual(len(match_cpu_block_ids), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py index 3864f41eb88..6d00e6d3d9d 100644 --- a/tests/v1/test_resource_manager_v1.py +++ b/tests/v1/test_resource_manager_v1.py @@ -9,7 +9,7 @@ from fastdeploy.config import CacheConfig, FDConfig, ParallelConfig, SchedulerConfig from fastdeploy.engine.args_utils import EngineArgs -from fastdeploy.engine.request import Request 
+from fastdeploy.engine.request import ImagePosition, Request from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1 @@ -173,5 +173,100 @@ def test_download_features_retry(self): self.assertEqual(self.request.error_code, 530) +class TestRevertChunkedMMInput(unittest.TestCase): + def setUp(self): + max_num_seqs = 2 + engine_args = EngineArgs( + max_num_seqs=max_num_seqs, + num_gpu_blocks_override=102, + max_num_batched_tokens=3200, + ) + args = asdict(engine_args) + + cache_cfg = CacheConfig(args) + model_cfg = SimpleNamespace(enable_mm=True) # Enable multimodal for feature testing + speculative_cfg = SimpleNamespace(method=None) + model_cfg.print = print + model_cfg.max_model_len = 5120 + model_cfg.architectures = ["test_model"] + cache_cfg.bytes_per_layer_per_block = 1 + parallel_cfg = ParallelConfig(args) + scheduler_cfg = SchedulerConfig(args) + graph_opt_cfg = engine_args.create_graph_optimization_config() + + fd_config = FDConfig( + model_config=model_cfg, + cache_config=cache_cfg, + parallel_config=parallel_cfg, + graph_opt_config=graph_opt_cfg, + speculative_config=speculative_cfg, + scheduler_config=scheduler_cfg, + ) + self.manager = ResourceManagerV1( + max_num_seqs=max_num_seqs, config=fd_config, tensor_parallel_size=8, splitwise_role="mixed" + ) + req_dict = { + "request_id": "test_request", + "multimodal_inputs": {}, + } + self.request = Request.from_dict(req_dict) + self.request.async_process_futures = [] + self.request.multimodal_inputs = {} + + def test_revert_chunked_mm_input_none_input(self): + result = self.manager.revert_chunked_mm_input(None, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_no_mm_positions(self): + mm_inputs = {"other_field": "value"} + result = self.manager.revert_chunked_mm_input(mm_inputs, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_empty_positions(self): + mm_inputs = {"mm_positions": []} + result = self.manager.revert_chunked_mm_input(mm_inputs, 10) + self.assertEqual(result, 10) + + def test_revert_chunked_mm_input_matched_in_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 8) + self.assertEqual(result, 5) + + def test_revert_chunked_mm_input_matched_in_second_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 25) + self.assertEqual(result, 20) + + def test_revert_chunked_mm_input_before_first_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 3) + self.assertEqual(result, 3) + + def test_revert_chunked_mm_input_after_last_chunk(self): + mm_inputs = { + "mm_positions": [ + ImagePosition(offset=5, length=10), + ImagePosition(offset=20, length=10), + ] + } + result = self.manager.revert_chunked_mm_input(mm_inputs, 35) + self.assertEqual(result, 35) + + if __name__ == "__main__": unittest.main() From b3f208dcabf08815bcdb198950ea7a9e5c05eda7 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 7 Jan 2026 11:37:26 +0800 Subject: [PATCH 5/5] update code --- .../cache_manager/prefix_cache_manager.py | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py 
b/fastdeploy/cache_manager/prefix_cache_manager.py
index 1d757a996f9..5a99a12cce7 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1368,37 +1368,6 @@ def mm_match_block(self, request, block_size): cpu_match_token_num, ) - def _roll_back_block( - self, - block_size: int, - matche_nodes: list, - match_gpu_block_ids: list, - match_cpu_block_ids: list, - match_node_ids: list, - swap_node_ids: list, - gpu_match_token_num: int, - cpu_match_token_num: int, - ): - revert_block = matche_nodes.pop() - revert_block_id = revert_block.block_id - if revert_block_id in match_gpu_block_ids: - match_gpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - gpu_match_token_num -= block_size - elif revert_block_id in match_cpu_block_ids: - match_cpu_block_ids.remove(revert_block_id) - match_node_ids.remove(revert_block.node_id) - cpu_match_token_num -= block_size - else: - raise Exception( - f"revert nodes error, nodes: {revert_block_id}, match_gpu_block_ids: {match_gpu_block_ids}, " - f"match_cpu_block_ids: {match_cpu_block_ids}" - ) - - if revert_block_id in swap_node_ids: - swap_node_ids.remove(revert_block_id) - return gpu_match_token_num, cpu_match_token_num - def match_block(self, req_id, input_ids, block_size): """ Args:
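
Net effect of the series: the chunked-multimodal-input rollback moves out of PrefixCacheManager (patches 1 and 5 add and then remove the interim _roll_back_block/_revert_match_blocks pair) and into ResourceManagerV1.revert_chunked_mm_input, which clamps the matched prefix back to the start of any image chunk the match boundary would otherwise split. Below is a minimal self-contained sketch of that final logic; the ImagePosition dataclass here is a stand-in for fastdeploy.engine.request.ImagePosition, assumed to carry offset/length fields, and the asserts mirror the expectations in the new TestRevertChunkedMMInput cases.

from dataclasses import dataclass

@dataclass
class ImagePosition:
    offset: int  # index of the first prompt token of the image chunk
    length: int  # number of prompt tokens the image occupies

def revert_chunked_mm_input(mm_inputs, matched_token_num):
    # Nothing to clamp when there are no multimodal positions.
    if mm_inputs is None or not mm_inputs.get("mm_positions"):
        return matched_token_num
    for position in mm_inputs["mm_positions"]:
        # The match boundary falls strictly inside this image chunk:
        # roll back to the chunk start so the image is prefilled whole.
        if position.offset < matched_token_num < position.offset + position.length:
            return position.offset
        # Positions are ordered by offset; stop once we are past the match.
        if matched_token_num < position.offset:
            break
    return matched_token_num

mm = {"mm_positions": [ImagePosition(offset=5, length=10), ImagePosition(offset=20, length=10)]}
assert revert_chunked_mm_input(mm, 8) == 5      # cut inside the first image
assert revert_chunked_mm_input(mm, 25) == 20    # cut inside the second image
assert revert_chunked_mm_input(mm, 3) == 3      # match ends before any image
assert revert_chunked_mm_input(mm, 35) == 35    # match covers every image

In get_prefix_cached_blocks this revert runs only when cache_config.disable_chunked_mm_input is set, and a full prefix hit is first backed off by one block_size (with skip_allocate set) so the request still has at least one block left to prefill.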
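Patches 2 and 3 additionally deduplicate mm_build_path: if the hashed block already exists as a child of the current radix-tree node, the request reuses that node, and the GPU block popped for it is recycled instead of leaking. A toy model of that reuse-or-create step, assuming a simplified Node in place of BlockNode and a plain list in place of the manager's free pool and recycle_gpu_blocks:

class Node:
    """Toy stand-in for BlockNode: one cached KV block in the radix tree."""
    def __init__(self, node_id, block_id):
        self.node_id = node_id
        self.block_id = block_id
        self.children = {}       # hash value -> Node
        self.req_id_set = set()

free_gpu_blocks = [7, 8, 9]      # toy free pool
recycled = []                    # blocks handed back on the reuse path

def insert_block(node, hash_value, req_id, node_id):
    # Pop the candidate GPU block before the children lookup (patch 3 moved
    # the pop up) so the reuse branch added by patch 2 can recycle it.
    allocated = free_gpu_blocks.pop(0)
    if hash_value in node.children:
        child = node.children[hash_value]
        child.req_id_set.add(req_id)
        recycled.append(allocated)   # stands in for recycle_gpu_blocks()
        return child
    child = Node(node_id, allocated)
    child.req_id_set.add(req_id)
    node.children[hash_value] = child
    return child

root = Node(0, -1)
a = insert_block(root, "h1", "req-A", 1)   # new node backed by block 7
b = insert_block(root, "h1", "req-B", 2)   # same hash: node reused, block 8 recycled
assert a is b and recycled == [8]

Without the patch-3 recycle call, every repeated hash would silently consume a block from the free pool, which is exactly the leak the follow-up commit closes.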