@@ -41,8 +41,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     i1 = T0.alloc_stride[0LL] * i0;
     nvfuser_index_t i2;
     i2 = 3LL * i0;
-    float T1[1LL];
-    float T2[1LL];
+    Array<float, 1LL, 1> T1;
+    Array<float, 1LL, 1> T2;
     T1[0LL] = 0LL;
     T1[0LL]
       = T0[i1];
@@ -53,7 +53,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
       nvfuser_index_t i4;
       i4 = (1LL + i3) + nvfuser_zero;
-      float T3[1LL];
+      Array<float, 1LL, 1> T3;
       T3[0LL]
         = T2[0LL];
       T4[(i2 + (i3 + nvfuser_zero))]
@@ -101,8 +101,8 @@ TEST_F(LoopRotationTest, RotateOuter) {
   const std::string expected_kernel = R"(
 __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T4) {
   NVFUSER_DEFINE_MAGIC_ZERO;
-  float T1[3LL];
-  float T2[3LL];
+  Array<float, 3LL, 1> T1;
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i0 = 0LL; i0 < 3LL; ++i0) {
     T1[i0] = 0LL;
@@ -202,8 +202,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i0 = T0.logical_size[0LL] * T0.logical_size[1LL];
   nvfuser_index_t i1;
   i1 = ceilDiv(i0, 5LL);
-  float T1[5LL];
-  float T2[5LL];
+  Array<float, 5LL, 1> T1;
+  Array<float, 5LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i2 = 0LL; i2 < 5LL; ++i2) {
     T1[i2] = 0LL;
@@ -306,7 +306,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   NVFUSER_DEFINE_MAGIC_ZERO;
   nvfuser_index_t i0;
   i0 = 4LL * T0.alloc_stride[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll 4
   for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
     nvfuser_index_t i2;
@@ -328,7 +328,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i6 = 0LL; i6 < 3LL; ++i6) {
     T2[i6]
@@ -362,7 +362,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T3[3LL];
+  Array<float, 3LL, 1> T3;
   #pragma unroll
   for(nvfuser_index_t i14 = 0LL; i14 < 3LL; ++i14) {
     T3[i14]
@@ -421,7 +421,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   i1 = 5LL * T0.alloc_stride[0LL];
   bool b2;
   b2 = 4LL < T0.logical_size[0LL];
-  float T1[15LL];
+  Array<float, 15LL, 1> T1;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[i3] = 0LL;
@@ -454,7 +454,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     }
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[3LL];
+  Array<float, 3LL, 1> T2;
   #pragma unroll
   for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
     T1[(12LL + i3)] = 0LL;
@@ -486,7 +486,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
     i13 = 3LL * ((1LL + i9) % 5LL);
     bool b14;
     b14 = (5LL + i9) < T0.logical_size[0LL];
-    float T3[3LL];
+    Array<float, 3LL, 1> T3;
     #pragma unroll
     for(nvfuser_index_t i15 = 0LL; i15 < 3LL; ++i15) {
       T3[i15]
@@ -599,7 +599,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
   asm volatile("cp.async.wait_group %0;\n"::"n"(3LL));
-  float T1[2LL];
+  Array<float, 2LL, 1> T1;
   T1[0LL]
     = T4[0LL];
   #pragma unroll 4
@@ -637,14 +637,14 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
   for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
     T1[((1LL + i14) % 2LL)]
       = T4[(i11 + i14)];
-    float T2[1LL];
+    Array<float, 1LL, 1> T2;
     T2[0LL]
       = T1[i14];
    T3[(i12 + (i14 + nvfuser_zero))]
      = T2[0LL];
   }
   NVFUSER_UPDATE_MAGIC_ZERO;
-  float T2[1LL];
+  Array<float, 1LL, 1> T2;
   T2[0LL]
     = T1[0LL];
   T3[(2LL + i12)]
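Every hunk above makes the same mechanical change to the expected-kernel strings: locally allocated register buffers are declared as `Array<float, N, 1>` instead of raw C arrays (`float T1[N];`), while every indexing expression (`T1[0LL]`, `T2[i6]`, ...) is left untouched. For orientation only, here is a minimal sketch of what such a wrapper can look like; this is an assumption for illustration, not nvfuser's actual runtime definition, which lives in its runtime headers and may differ in detail.

// Hypothetical sketch of an aligned register-array wrapper in the spirit of
// the Array<T, size, align> type used in the expected kernels above.
// Illustration only; the real nvfuser type may differ.
template <typename scalar_t, int size, int align_size>
struct Array {
  // Align the backing storage to align_size elements so that vectorized
  // loads and stores on the buffer remain legal.
  alignas(sizeof(scalar_t) * align_size) scalar_t array[size];

  __device__ scalar_t& operator[](int i) {
    return array[i];
  }
  __device__ const scalar_t& operator[](int i) const {
    return array[i];
  }
};

Because the wrapper exposes `operator[]`, swapping it in changes only the declaration lines; that is why each hunk in this diff touches nothing but the `float Tn[...]` declarations.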