diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 9d8925ceaf..c1e92b69ec 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -333,7 +333,7 @@ def schedule(self): # Prepare decoding task scheduled_reqs.append(self._prepare_decode_task(request)) num_decoding_req_nums += 1 - token_budget -= 1 + token_budget -= 1 else: # need to prefill llm_logger.debug( f"scheduler prefill task: {request} request.need_prefill_tokens {request.need_prefill_tokens} request.num_computed_tokens {request.num_computed_tokens}" diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 3817b49326..a51b5c2a2e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1167,11 +1167,12 @@ def initialize_kv_cache(self, profile: bool = False) -> None: fill_value=0, dtype=cache_type, ) - cache_kvs[f"value_caches_{i}"] = paddle.full( - shape=kv_cache_shape, - fill_value=0, - dtype=cache_type, - ) + if "deepseek" not in self.fd_config.model_config.model_type: + cache_kvs[f"value_caches_{i}"] = paddle.full( + shape=kv_cache_shape, + fill_value=0, + dtype=cache_type, + ) if kv_cache_quant_type == "block_wise_fp8": cache_kvs[f"key_cache_scales_{i}"] = paddle.full( shape=kv_cache_scale_shape, @@ -1898,7 +1899,15 @@ def cal_theortical_kvcache(self): if self.speculative_method in ["mtp"] else self.model_config.num_hidden_layers ) - required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v + if "deepseek" in self.fd_config.model_config.model_type: + required_memory = ( + byte_of_dtype + * (self.fd_config.model_config.kv_lora_rank + self.fd_config.model_config.qk_rope_head_dim) + * (self.cache_config.block_size) + * num_layers + ) # compress_kv + k_pe + else: + required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v return required_memory def not_need_stop(self) -> bool: