From 0288c31624c7579f760625abc11ef2185d998832 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Tue, 9 Dec 2025 00:00:23 +0100
Subject: [PATCH 1/2] Add DIAG for CUDA

---
 ggml/src/ggml-cuda/diag.cu      | 77 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/diag.cuh     |  5 +++
 ggml/src/ggml-cuda/ggml-cuda.cu |  5 +++
 tests/test-backend-ops.cpp      | 29 +++++++++++++
 4 files changed, 116 insertions(+)
 create mode 100644 ggml/src/ggml-cuda/diag.cu
 create mode 100644 ggml/src/ggml-cuda/diag.cuh

diff --git a/ggml/src/ggml-cuda/diag.cu b/ggml/src/ggml-cuda/diag.cu
new file mode 100644
index 00000000000..318fd7ef36f
--- /dev/null
+++ b/ggml/src/ggml-cuda/diag.cu
@@ -0,0 +1,77 @@
+#include "convert.cuh"
+#include "diag.cuh"
+#include "ggml.h"
+
+template <typename T>
+static __global__ void diag_kernel(T * __restrict__ dst,
+                                   const T * __restrict__ src,
+                                   const int64_t ne00,
+                                   const int64_t ne01,
+                                   const int64_t ne02,
+                                   const int64_t ne03,
+                                   const int64_t ne0,
+                                   const int64_t ne1,
+                                   const int64_t ne2,
+                                   const int64_t ne3) {
+    const int64_t global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t total_elements = ne0 * ne1 * ne2 * ne3;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int64_t i0 = global_idx % ne0;
+    const int64_t i1 = (global_idx / ne0) % ne1;
+    const int64_t i2 = (global_idx / (ne0 * ne1)) % ne2;
+    const int64_t i3 = global_idx / (ne0 * ne1 * ne2);
+
+    const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;
+
+    if (i0 == i1) {
+        const int64_t batch_idx = i3 * ne2 + i2;
+        const int64_t src_idx   = batch_idx * ne00 + i0;
+        dst[dst_idx] = src[src_idx];
+    } else {
+        dst[dst_idx] = ggml_cuda_cast<T>(0);
+    }
+
+    GGML_UNUSED_VARS(ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    void       * dst_d  = dst->data;
+    const void * src0_d = src0->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t n_elems    = ggml_nelements(dst);
+    const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne00,
+                                                                         ne01, ne02, ne03, ne0, ne1, ne2, ne3);
+            break;
+        case GGML_TYPE_F16:
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne00,
+                                                                         ne01, ne02, ne03, ne0, ne1, ne2, ne3);
+            break;
+        default:
+            GGML_ABORT("unsupported type");
+    }
+}
diff --git a/ggml/src/ggml-cuda/diag.cuh b/ggml/src/ggml-cuda/diag.cuh
new file mode 100644
index 00000000000..7d73e6a8eb4
--- /dev/null
+++ b/ggml/src/ggml-cuda/diag.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_DIAG_BLOCK_SIZE 256
+
+void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d0463388c54..279679a4eac 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -20,6 +20,7 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
+#include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
@@ -2641,6 +2642,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             break;
+        case GGML_OP_DIAG:
+            ggml_cuda_op_diag(ctx, dst);
+            break;
         case GGML_OP_DIAG_MASK_INF:
             ggml_cuda_op_diag_mask_inf(ctx, dst);
             break;
@@ -4624,6 +4628,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
         case GGML_OP_FILL:
        case GGML_OP_CUMSUM:
         case GGML_OP_TRI:
+        case GGML_OP_DIAG:
             return true;
         case GGML_OP_SOLVE_TRI:
             return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2e94a53da25..289e2e6d7fd 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6253,6 +6253,31 @@ struct test_solve_tri : public test_case {
     }
 };
 
+// GGML_OP_DIAG
+struct test_diag : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
+
+    test_diag(ggml_type type = GGML_TYPE_F32,
+              std::array<int64_t, 4> ne = { 10, 1, 4, 3 })
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        GGML_ASSERT(ne[1] == 1);
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_diag(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
@@ -7826,6 +7851,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 }));
     test_cases.emplace_back(new test_fill(3.5f, GGML_TYPE_F32, { 2048, 512, 2, 2 }));
 
+    test_cases.emplace_back(new test_diag());
+    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 79, 1, 19, 13 }));
+    test_cases.emplace_back(new test_diag(GGML_TYPE_F32, { 256, 1, 8, 16 }));
+
     test_cases.emplace_back(new test_solve_tri());
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 17, 17, 2, 4 }, { 9, 17, 2, 4 }));

From 1f26fbd6e73e61cf1d98ea07a20b6359cf81adf4 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Tue, 9 Dec 2025 00:00:23 +0100
Subject: [PATCH 2/2] Refactor parameters

---
 ggml/src/ggml-cuda/diag.cu | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-cuda/diag.cu b/ggml/src/ggml-cuda/diag.cu
index 318fd7ef36f..5cea210517f 100644
--- a/ggml/src/ggml-cuda/diag.cu
+++ b/ggml/src/ggml-cuda/diag.cu
@@ -5,16 +5,12 @@
 template <typename T>
 static __global__ void diag_kernel(T * __restrict__ dst,
                                    const T * __restrict__ src,
-                                   const int64_t ne00,
-                                   const int64_t ne01,
-                                   const int64_t ne02,
-                                   const int64_t ne03,
                                    const int64_t ne0,
                                    const int64_t ne1,
                                    const int64_t ne2,
-                                   const int64_t ne3) {
-    const int64_t global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
-    const int64_t total_elements = ne0 * ne1 * ne2 * ne3;
+                                   const int64_t ne3,
+                                   const int64_t total_elements) {
+    const int64_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (global_idx >= total_elements) {
         return;
@@ -29,13 +25,12 @@ static __global__ void diag_kernel(T * __restrict__ dst,
 
     if (i0 == i1) {
         const int64_t batch_idx = i3 * ne2 + i2;
-        const int64_t src_idx   = batch_idx * ne00 + i0;
+        const int64_t src_idx   = batch_idx * ne0 + i0;
         dst[dst_idx] = src[src_idx];
     } else {
         dst[dst_idx] = ggml_cuda_cast<T>(0);
     }
-
-    GGML_UNUSED_VARS(ne01, ne02, ne03);
+    GGML_UNUSED_VARS(ne3);
 }
 
 void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -59,17 +54,22 @@ void ggml_cuda_op_diag(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ne2 = dst->ne[2];
     const int64_t ne3 = dst->ne[3];
 
+    GGML_ASSERT(ne00 == ne0);
+    GGML_ASSERT(ne01 == 1);
+    GGML_ASSERT(ne02 == ne2);
+    GGML_ASSERT(ne03 == ne3);
+
     const int64_t n_elems    = ggml_nelements(dst);
     const int64_t num_blocks = (n_elems + CUDA_DIAG_BLOCK_SIZE - 1) / CUDA_DIAG_BLOCK_SIZE;
 
     switch (dst->type) {
         case GGML_TYPE_F32:
-            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne00,
-                                                                         ne01, ne02, ne03, ne0, ne1, ne2, ne3);
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((float *) dst_d, (const float *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
             break;
         case GGML_TYPE_F16:
-            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne00,
-                                                                         ne01, ne02, ne03, ne0, ne1, ne2, ne3);
+            diag_kernel<<<num_blocks, CUDA_DIAG_BLOCK_SIZE, 0, stream>>>((half *) dst_d, (const half *) src0_d, ne0,
+                                                                         ne1, ne2, ne3, n_elems);
             break;
         default:
            GGML_ABORT("unsupported type");
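
Reviewer note (not part of the patch): as a sanity check for the kernel's indexing, here is a
minimal host-side sketch of the semantics GGML_OP_DIAG is expected to have for the shapes the
new asserts allow, assuming a contiguous F32 src of shape [ne0, 1, ne2, ne3] and dst of shape
[ne0, ne0, ne2, ne3]; the function diag_reference below is illustrative only and does not exist
in ggml.

    #include <cstdint>
    #include <vector>

    // For each batch (i2, i3), expand the length-ne0 row of src into an ne0 x ne0
    // matrix whose main diagonal holds that row and which is zero elsewhere.
    static void diag_reference(std::vector<float> & dst, const std::vector<float> & src,
                               int64_t ne0, int64_t ne2, int64_t ne3) {
        dst.assign((size_t) (ne0 * ne0 * ne2 * ne3), 0.0f);
        for (int64_t i3 = 0; i3 < ne3; ++i3) {
            for (int64_t i2 = 0; i2 < ne2; ++i2) {
                const int64_t batch = i3 * ne2 + i2;
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    // Same flattened index the CUDA kernel computes:
                    // ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0, with ne1 == ne0 and i1 == i0.
                    dst[(batch * ne0 + i0) * ne0 + i0] = src[batch * ne0 + i0];
                }
            }
        }
    }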