53 changes: 27 additions & 26 deletions tests/layers/test_attention_layer.py
@@ -49,6 +49,7 @@
 from fastdeploy.model_executor.layers.rotary_embedding import get_rope
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
 from fastdeploy.model_executor.ops.gpu import get_padding_offset
+from fastdeploy.worker.worker_process import init_distributed_environment
 
 if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
     # (ZKK): CI machine.
@@ -65,19 +66,20 @@ def setUp(self):
         print("Setting up test environment...")
         paddle.set_device("gpu")
         paddle.set_default_dtype("bfloat16")
+        init_distributed_environment()
 
         self.model_dir = self.create_model_config_json()
-        self.fd_config = self.create_fd_config_from_model_path(self.model_dir, tensor_parallel_size=1)
-        self.fd_config.parallel_config.tp_group = [0]
+        tp_size = paddle.distributed.get_world_size()
+
+        self.fd_config = self.create_fd_config_from_model_path(self.model_dir, tensor_parallel_size=tp_size)
+        self.fd_config.parallel_config.tp_group = paddle.distributed.new_group(range(tp_size))
 
         # Initialize Attention Layer
         attn_cls = get_attention_backend()
         self.attn_backend = attn_cls(
             self.fd_config,
-            kv_num_heads=self.fd_config.model_config.num_key_value_heads
-            // self.fd_config.parallel_config.tensor_parallel_size,
-            num_heads=self.fd_config.model_config.num_attention_heads
-            // self.fd_config.parallel_config.tensor_parallel_size,
+            kv_num_heads=self.fd_config.model_config.num_key_value_heads // tp_size,
+            num_heads=self.fd_config.model_config.num_attention_heads // tp_size,
             head_dim=self.fd_config.model_config.head_dim,
             encoder_block_shape_q=64,
             decoder_block_shape_q=16,
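For reference, the arithmetic this hunk switches to is plain GQA head sharding: each tensor-parallel rank keeps an equal slice of the query heads and of the key/value heads. A minimal sketch, assuming the head counts this PR writes into the test config below (56 query heads, 4 KV heads; variable names are illustrative):

num_attention_heads = 56  # from the test config later in this diff
num_key_value_heads = 4
head_dim = 128

for tp_size in (1, 2, 4):
    # Each rank owns num_heads // tp_size query heads plus the matching KV slice.
    q_heads_per_rank = num_attention_heads // tp_size
    kv_heads_per_rank = num_key_value_heads // tp_size
    print(tp_size, q_heads_per_rank, kv_heads_per_rank, q_heads_per_rank * head_dim)

Both counts must divide evenly by tp_size for the sharding to be exact, so world sizes of 1, 2, and 4 all work with the 56/4 configuration below.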
@@ -119,12 +121,12 @@ def create_model_config_json(self) -> str:
         config_dict = {
             "architectures": ["Ernie4_5_MoeForCausalLM"],
             "dtype": "bfloat16",
-            "max_position_embeddings": 131072,
-            "max_model_len": 131072,
+            "max_position_embeddings": 201 * 1024,
+            "max_model_len": 201 * 1024,
             "head_dim": 128,
-            "hidden_size": 8192,
-            "num_attention_heads": 64,
-            "num_key_value_heads": 8,
+            "hidden_size": 7168,
+            "num_attention_heads": 56,
+            "num_key_value_heads": 4,
             "num_hidden_layers": 2,
         }
         model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
@@ -174,10 +176,9 @@ def create_random_attention_state_dict(self, fd_config: FDConfig, prefix: str) -
         tp_size = fd_config.parallel_config.tensor_parallel_size
         tensor_dtype = getattr(paddle, fd_config.model_config.dtype)
 
-        q_dims = fd_config.model_config.num_attention_heads * fd_config.model_config.head_dim
-        kv_dims = fd_config.model_config.num_key_value_heads * fd_config.model_config.head_dim
-        total_output_dim = q_dims + 2 * kv_dims
-        qkv_proj_output_dim_tp = total_output_dim // tp_size
+        q_dims = fd_config.model_config.num_attention_heads // tp_size * fd_config.model_config.head_dim
+        kv_dims = fd_config.model_config.num_key_value_heads // tp_size * fd_config.model_config.head_dim
+        qkv_proj_output_dim_tp = q_dims + 2 * kv_dims
         qkv_weight_shape = [hidden_size, qkv_proj_output_dim_tp]
 
         o_proj_input_dim = fd_config.model_config.num_attention_heads * fd_config.model_config.head_dim
@@ -187,8 +188,7 @@ def create_random_attention_state_dict(self, fd_config: FDConfig, prefix: str) -
         qkv_weight = paddle.randn(qkv_weight_shape, dtype=tensor_dtype)
         o_proj_weight = paddle.randn(o_proj_weight_shape, dtype=tensor_dtype)
 
-        kv_num_heads_tp = fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size
-        activation_scale_shape = [kv_num_heads_tp]
+        activation_scale_shape = [fd_config.model_config.num_key_value_heads]
         activation_scale_tensor = paddle.full(shape=activation_scale_shape, fill_value=1.0, dtype=tensor_dtype)
 
         state_dict = {
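As a quick sanity check on the reshaped per-rank weights: the fused QKV projection now carries q_dims + 2 * kv_dims columns per rank rather than the full width divided by tp_size (the two agree whenever both head counts are divisible by tp_size). A worked sketch under the config assumed above (hidden_size 7168, head_dim 128; tp_size 4 chosen for illustration):

hidden_size, head_dim = 7168, 128
num_q_heads, num_kv_heads = 56, 4

tp_size = 4  # illustrative world size
q_dims = num_q_heads // tp_size * head_dim    # 14 heads -> 1792
kv_dims = num_kv_heads // tp_size * head_dim  # 1 head  -> 128
qkv_weight_shape = [hidden_size, q_dims + 2 * kv_dims]
assert qkv_weight_shape == [7168, 2048]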
@@ -222,14 +222,15 @@ def create_forward_meta(
         else:
             raise ValueError(f"Unsupported ForwardMode: {mode}")
 
+        tp_size = fd_config.parallel_config.tensor_parallel_size
         attn_backend_buffers = allocate_launch_related_buffer(
             max_batch_size=batch_size,
             max_model_len=fd_config.model_config.max_model_len,
             encoder_block_shape_q=64,
             decoder_block_shape_q=16,
             decoder_step_token_num=fd_config.speculative_config.num_speculative_tokens + 1,
-            num_heads=fd_config.model_config.num_attention_heads,
-            kv_num_heads=fd_config.model_config.num_key_value_heads,
+            num_heads=fd_config.model_config.num_attention_heads // tp_size,
+            kv_num_heads=fd_config.model_config.num_key_value_heads // tp_size,
             block_size=fd_config.cache_config.block_size,
         )
 
@@ -239,7 +240,7 @@ def create_forward_meta(
         allocated_blocks_per_seq = seq_len // block_size + 1
         allocated_num_blocks = allocated_blocks_per_seq * batch_size
         head_dim = fd_config.model_config.head_dim
-        kv_num_heads_tp = fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size
+        kv_num_heads_tp = fd_config.model_config.num_key_value_heads // tp_size
         num_layers = fd_config.model_config.num_hidden_layers
         cache_type = fd_config.model_config.dtype
         if cache_quant_type_str != "none":
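These shapes drive the per-rank KV-cache footprint. A rough sketch for the bf16 case, under the same assumed test config (block_size and tp_size values are illustrative; quantized cache types, as the branch above suggests, would shrink this further):

# Rough bf16 KV-cache size for one 10K-token decode sequence on one rank.
block_size = 64           # assumed cache block size
seq_len = 10 * 1024
head_dim, num_layers = 128, 2
kv_num_heads_tp = 4       # KV heads per rank at tp_size = 1

allocated_blocks_per_seq = seq_len // block_size + 1
# The factor 2 covers the K and V tensors per layer.
elems = num_layers * 2 * allocated_blocks_per_seq * block_size * kv_num_heads_tp * head_dim
print(f"{elems * 2 / 2**20:.1f} MiB")  # 2 bytes per bf16 element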
@@ -261,7 +262,7 @@ def create_forward_meta(
             for j in range(allocated_blocks_per_seq):
                 block_tables[i, j] = i * allocated_blocks_per_seq + j
 
-        tmp_position_ids = paddle.arange(fd_config.model_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(max_model_len).reshape((1, -1))
         rope_emb = get_rope(
             rotary_dim=fd_config.model_config.head_dim,
             position_ids=tmp_position_ids,
@@ -270,7 +271,7 @@ def create_forward_meta(
             partial_rotary_factor=fd_config.model_config.partial_rotary_factor,
         )
 
-        input_ids = paddle.zeros([batch_size, fd_config.model_config.max_model_len], dtype="int64")
+        input_ids = paddle.zeros([batch_size, max_model_len], dtype="int64")
         token_num = np.sum(seq_lens_this_time)
         ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset(
             input_ids, seq_lens_this_time, token_num
@@ -303,7 +304,7 @@ def test_decode_performance_with_prefill(self):
         test_steps = 100
 
         prefill_batch_size = 1
-        prefill_seq_len = 2048
+        prefill_seq_len = 4096 * 2
 
         forward_meta, prefill_hidden_states = self.create_forward_meta(
             batch_size=prefill_batch_size,
@@ -341,7 +342,7 @@ def test_decode_performance_with_prefill(self):
         print(times[-5:])
         # p.stop()
 
-        return
+        # return
 
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
@@ -351,10 +352,10 @@ def test_decode_performance_with_prefill(self):
         # p.start()
         # p.step()
 
-        for decode_batch_size in [32, 16, 8, 4, 2]:
+        for decode_batch_size in [1, 2, 3, 4, 5, 10, 20, 40, 60, 80, 100, 128, 160, 192, 256]:
             forward_meta, hidden_states = self.create_forward_meta(
                 batch_size=decode_batch_size,
-                seq_len=36 * 1024,
+                seq_len=10 * 1024,
                 mode=ForwardMode.DECODE,
                 fd_config=self.fd_config,
                 attn_backend=self.attn_backend,
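With the sweep widened to batch sizes 1 through 256 at 10K context, per-step timing dominates the test. A generic pattern for timing repeated GPU decode steps (a sketch, not the test's own helper; it assumes paddle.device.cuda.synchronize() to drain queued kernels before reading the clock):

import time

import paddle

def time_steps(run_step, steps=100, warmup=10):
    # Mean wall-clock latency of a repeated decode step (sketch).
    for _ in range(warmup):
        run_step()
    paddle.device.cuda.synchronize()  # finish queued work before starting the clock
    start = time.perf_counter()
    for _ in range(steps):
        run_step()
    paddle.device.cuda.synchronize()  # ensure all timed kernels completed
    return (time.perf_counter() - start) / steps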