@@ -38,57 +38,95 @@ rocfft_transpose_outofplace_template(size_t m, size_t n, const T* A, T* B, void
38
38
dim3 grid ((n-1 )/TRANSPOSE_DIM_X + 1 , ( (m-1 )/TRANSPOSE_DIM_X + 1 ), count);
39
39
dim3 threads (TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, 1 );
40
40
41
+ bool noCorner = false ;
42
+
43
+ if ((n % TRANSPOSE_DIM_X == 0 ) && (m % TRANSPOSE_DIM_X == 0 ))// working threads match problem sizes, no corner cases
44
+ {
45
+ noCorner = true ;
46
+ }
41
47
42
48
if (scheme == 0 )
43
49
{
44
50
if (twl == 2 )
45
51
{
46
52
if (dir == -1 )
47
53
{
48
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , -1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
49
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
54
+ if (noCorner)
55
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , -1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
56
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
57
+ else
58
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , -1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
59
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
50
60
}
51
61
else
52
62
{
53
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , 1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
54
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
63
+ if (noCorner)
64
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , 1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
65
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
66
+ else
67
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 2 , 1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
68
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
55
69
}
56
70
}
57
71
else if (twl == 3 )
58
72
{
59
73
if (dir == -1 )
60
74
{
61
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , -1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
62
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
75
+ if (noCorner)
76
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , -1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
77
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
78
+ else
79
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , -1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
80
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
63
81
}
64
82
else
65
83
{
66
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , 1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
67
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
84
+ if (noCorner)
85
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , 1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
86
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
87
+ else
88
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 3 , 1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
89
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
68
90
}
69
91
}
70
92
else if (twl == 4 )
71
93
{
72
94
if (dir == -1 )
73
95
{
74
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , -1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
75
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
96
+ if (noCorner)
97
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , -1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
98
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
99
+ else
100
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , -1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
101
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
76
102
}
77
103
else
78
104
{
79
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , 1 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
80
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
105
+ if (noCorner)
106
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , 1 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
107
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
108
+ else
109
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true , 4 , 1 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
110
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
81
111
}
82
112
}
83
113
else
84
114
{
85
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, false , 0 , 0 >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
86
- A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
115
+ if (noCorner)
116
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, false , 0 , 0 , true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
117
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
118
+ else
119
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, false , 0 , 0 , false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
120
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out);
87
121
}
88
122
}
89
123
else
90
124
{
91
- hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2_scheme<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y>), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
125
+ if (noCorner)
126
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2_scheme<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
127
+ A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out, scheme);
128
+ else
129
+ hipLaunchKernelGGL (HIP_KERNEL_NAME (transpose_kernel2_scheme<T, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, false >), dim3 (grid), dim3 (threads), 0 , rocfft_stream,
92
130
A, B, (T *)twiddles_large, dim, lengths, stride_in, stride_out, scheme);
93
131
}
94
132
0 commit comments