From f95b04a21cbb748ff5ed1a0489389166bc345672 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 18:47:37 +0200 Subject: [PATCH] model : fix order kvq -> qkv ggml-ci --- src/llama-context.cpp | 12 +++--- src/llama-context.h | 2 +- src/llama-graph.h | 2 +- src/llama-model.cpp | 95 ++++++++++++++++++++++--------------------- 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 818702143e196..dbc9231acf1c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2572,9 +2572,9 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, @@ -2617,9 +2617,6 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } - const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; - // TODO: improve bool is_sliding = false; @@ -2648,8 +2645,11 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto n_kv = worst_case ? kv_self.size : kv_self.n; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index fb241adf1d151..2b3d5f122bbbe 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -381,9 +381,9 @@ class llama_context_kv_self : public llama_context { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-graph.h b/src/llama-graph.h index 9adfc6f2313e2..b64e0f5f4fdb0 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,9 +93,9 @@ class llama_graph_i { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a22720c3ad184..debbacbb6183b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4246,9 +4246,9 @@ struct llm_build_context { struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il) { @@ -4258,7 +4258,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, k_cur, v_cur, q_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4460,7 +4460,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4632,7 +4632,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4768,7 +4768,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4874,7 +4874,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4996,7 +4996,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5118,7 +5118,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -5265,7 +5265,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5375,7 +5375,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5470,7 +5470,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5763,7 +5763,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5896,13 +5896,13 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } } @@ -6048,7 +6048,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6168,7 +6168,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6283,7 +6283,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6401,7 +6401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6514,7 +6514,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6673,7 +6673,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6796,7 +6796,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6921,7 +6921,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } struct ggml_tensor * sa_out = cur; @@ -7024,7 +7024,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7136,7 +7136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7257,7 +7257,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7376,7 +7376,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7570,7 +7570,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -7692,7 +7692,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -7806,7 +7806,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } cur = build_norm(cur, @@ -7943,7 +7943,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8143,7 +8143,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8276,8 +8276,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - n_tokens, 1.0f / sqrtf(float(n_embd_head)), il); + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8400,7 +8401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8515,7 +8516,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } cur = build_norm(cur, @@ -8643,7 +8644,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8773,7 +8774,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8883,7 +8884,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9025,7 +9026,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9172,7 +9173,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9400,7 +9401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9558,7 +9559,7 @@ struct llm_build_context { cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, @@ -10007,7 +10008,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/float(n_embd_head), il); } if (il == n_layer - 1) { @@ -10135,7 +10136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10254,7 +10255,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10377,7 +10378,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10699,7 +10700,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); if (hparams.swin_norm) { cur = build_norm(cur,