diff --git a/awq_cuda/quantization/gemm_cuda_gen.cu b/awq_cuda/quantization/gemm_cuda_gen.cu index 3ff74c07..a3f4e8a5 100644 --- a/awq_cuda/quantization/gemm_cuda_gen.cu +++ b/awq_cuda/quantization/gemm_cuda_gen.cu @@ -189,7 +189,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -197,7 +197,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -205,7 +205,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -213,7 +213,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -221,7 +221,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } #else { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -229,7 +229,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -420,7 +420,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -428,7 +428,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -436,7 +436,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -444,7 +444,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) @@ -452,7 +452,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } #else { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) @@ -460,7 +460,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in } { - __asm__ __volatile__( + asm volatile( "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])