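Gate flash-attention debug prints behind a DEBUG flag and scale the shared-memory input size by the kStagesQK/kStagesV stage counts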

KuangjuX · KuangjuX · commit 81d70b317186 · 2025-01-10T09:20:37.000Z
diff --git a/benchmarks/cpp/flashattention/copy.cuh b/benchmarks/cpp/flashattention/copy.cuh
@@ -491,6 +491,10 @@ inline __device__ auto make_g2s_qk(const Element* gQ_ptr, Element* sQ_ptr,
 
     TiledCopy tiled_copy;
 
+    // if (thread0()) {
+    //     print_latex(tiled_copy);
+    // }
+
     auto loader = tiled_copy.get_thread_slice(tid);
 
     auto gQs = loader.partition_S(gQ);
@@ -501,10 +505,12 @@ inline __device__ auto make_g2s_qk(const Element* gQ_ptr, Element* sQ_ptr,
     int sQ_stride = size(sQ);
     int sK_stride = size(sK);
 
+#ifdef DEBUG
     if (thread0()) {
         printf("gQ_stride: %d, sQ_stride: %d, gK_stride: %d, sK_stride: %d\n",
                gQ_stride, sQ_stride, gK_stride, sK_stride);
     }
+#endif
 
     detail::G2SCopyQK copy_qk(gQs, sQs, gKs, sKs, tiled_copy, gQ_stride,
                               sQ_stride, gK_stride, sK_stride);
@@ -529,9 +535,11 @@ DEVICE auto make_g2s_v(const Element* gV_ptr, Element* sV_ptr, int gV_stride) {
 
     int sV_stride = size(sV);
 
+#ifdef DEBUG
     if (thread0()) {
         printf("gV_stride: %d, sV_stride: %d\n", gV_stride, sV_stride);
     }
+#endif
 
     detail::G2SCopyV copy_v(gVs, sVs, tiled_copy, gV_stride, sV_stride);
 
@@ -556,6 +564,15 @@ DEVICE auto make_s2r_qk(const Element* sQ_ptr, const Element* sK_ptr,
     auto s2r_thr_copy_q = s2r_copy_q.get_thread_slice(tid);
     auto s2r_thr_copy_k = s2r_copy_k.get_thread_slice(tid);
 
+#ifdef DEBUG
+    if (thread0()) {
+        printf("sQ_Layout: ");
+        print(sQ_layout), print('\n');
+        printf("s2r_copy_q: ");
+        print(s2r_copy_q), print('\n');
+    }
+#endif
+
     auto sQ = s2r_thr_copy_q.partition_S(sQ_);
     auto sK = s2r_thr_copy_k.partition_S(sK_);
 
@@ -567,6 +584,19 @@ DEVICE auto make_s2r_qk(const Element* sQ_ptr, const Element* sK_ptr,
     auto rQ_copy = s2r_thr_copy_q.retile_D(rQ_mma);
     auto rK_copy = s2r_thr_copy_k.retile_D(rK_mma);
 
+#ifdef DEBUG
+    if (thread0()) {
+        printf("sQ_: ");
+        print(sQ_), print('\n');
+        printf("sQ: ");
+        print(sQ), print('\n');
+        printf("rQ_copy: ");
+        print(rQ_copy), print('\n');
+        printf("rQ_mma: ");
+        print(rQ_mma), print('\n');
+    }
+#endif
+
     int sQ_stride = size(sQ_);
     int sK_stride = size(sK_);
 
diff --git a/benchmarks/cpp/flashattention/cutlass_fa.cuh b/benchmarks/cpp/flashattention/cutlass_fa.cuh
@@ -141,12 +141,14 @@ __global__ void __launch_bounds__(Nthreads)
     auto acc0 = get_acc<kTM, kTN>(mma);
     auto acco = get_acc<kTM, kTP>(mma);
 
+#ifdef DEBUG
     if (thread0()) {
         printf("acc0 size<0>: %d, size<1>: %d, size<2>: %d\n",
                (int)size<0>(acc0), (int)size<1>(acc0), (int)size<2>(acc0));
         printf("acco size<0>: %d, size<1>: %d, size<2>: %d\n",
                (int)size<0>(acco), (int)size<1>(acco), (int)size<2>(acco));
     }
+#endif
 
     /**
      * In TileFusion, we use
@@ -226,13 +228,15 @@ __global__ void __launch_bounds__(Nthreads)
         auto previous_attn_block =
             make_tensor(acco.data(), convert_layout_scores(acco.layout()));
 
+#ifdef DEBUG
         if (thread0()) {
             printf("scores size<0>: %d, size<1>: %d\n", (int)size<0>(scores),
                    (int)size<1>(scores));
             printf("previous_attn_block size<0>: %d, size<1>: %d\n",
                    (int)size<0>(previous_attn_block),
                    (int)size<1>(previous_attn_block));
         }
+#endif
 
         // Renormalization for the previous block.
         for (int ax0 = 0; ax0 < size<0>(previous_attn_block); ++ax0) {
diff --git a/benchmarks/cpp/flashattention/main.cu b/benchmarks/cpp/flashattention/main.cu
@@ -89,7 +89,8 @@ void run(bool check = true) {
     dim3 grid(block_x, block_y, block_z);
     dim3 block(kThreads, 1, 1);
 
-    int shm_input = (kTM * kTK + kTK * kTN + kTN * kTP);
+    int shm_input =
+        (kTM * kTK * kStagesQK + kTK * kTN * kStagesQK + kTN * kTP * kStagesV);
     int shm_output = kTM * kTP;
     int shm_size = shm_input < shm_output ? shm_output * sizeof(InType)
                                           : shm_input * sizeof(InType);