Skip to content

Commit 3158d84 (parent: 062dd50)

Commit message: Prefer Array class over register arrays. i.e. Array<float, 2> rather than float[2].

File tree

3 files changed: +23 additions, -24 deletions

csrc/codegen.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
265265
} else if (v->isA<TensorView>()) {
266266
tv = v->as<TensorView>();
267267
}
268-
if (tv && aligned_array_of_regs_.count(tv)) {
268+
if (tv &&
269+
(aligned_array_of_regs_.count(tv) ||
270+
tv->getMemoryType() == MemoryType::Local)) {
269271
return genVariableName(tv).append(".array");
270272
} else {
271273
return genVariableName(v);
@@ -3169,14 +3171,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
31693171
break;
31703172
case MemoryType::Local: {
31713173
auto va = kernel_->summary().vectorized_accesses;
3174+
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
3175+
<< ", " << (va.find(tv) != va.end() ? va.at(tv) : 1) << "> "
3176+
<< genVariableName(tv) << ";\n";
31723177
if (va.find(tv) != va.end()) {
3173-
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
3174-
<< ", " << va.at(tv) << "> " << genVariableName(tv)
3175-
<< ";\n";
31763178
aligned_array_of_regs_.insert(tv);
3177-
} else {
3178-
indent() << buffer_dtype << " " << genVariableName(tv) << "["
3179-
<< genInline(size) << "];\n";
31803179
}
31813180
} break;
31823181
default:

tests/cpp/test_loop_rotation.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
4141
i1 = T0.alloc_stride[0LL] * i0;
4242
nvfuser_index_t i2;
4343
i2 = 3LL * i0;
44-
float T1[1LL];
45-
float T2[1LL];
44+
Array<float, 1LL, 1> T1;
45+
Array<float, 1LL, 1> T2;
4646
T1[0LL] = 0LL;
4747
T1[0LL]
4848
= T0[i1];
@@ -53,7 +53,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
5353
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
5454
nvfuser_index_t i4;
5555
i4 = (1LL + i3) + nvfuser_zero;
56-
float T3[1LL];
56+
Array<float, 1LL, 1> T3;
5757
T3[0LL]
5858
= T2[0LL];
5959
T4[(i2 + (i3 + nvfuser_zero))]
@@ -101,8 +101,8 @@ TEST_F(LoopRotationTest, RotateOuter) {
101101
const std::string expected_kernel = R"(
102102
__global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T4) {
103103
NVFUSER_DEFINE_MAGIC_ZERO;
104-
float T1[3LL];
105-
float T2[3LL];
104+
Array<float, 3LL, 1> T1;
105+
Array<float, 3LL, 1> T2;
106106
#pragma unroll
107107
for(nvfuser_index_t i0 = 0LL; i0 < 3LL; ++i0) {
108108
T1[i0] = 0LL;
@@ -202,8 +202,8 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
202202
i0 = T0.logical_size[0LL] * T0.logical_size[1LL];
203203
nvfuser_index_t i1;
204204
i1 = ceilDiv(i0, 5LL);
205-
float T1[5LL];
206-
float T2[5LL];
205+
Array<float, 5LL, 1> T1;
206+
Array<float, 5LL, 1> T2;
207207
#pragma unroll
208208
for(nvfuser_index_t i2 = 0LL; i2 < 5LL; ++i2) {
209209
T1[i2] = 0LL;
@@ -306,7 +306,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
306306
NVFUSER_DEFINE_MAGIC_ZERO;
307307
nvfuser_index_t i0;
308308
i0 = 4LL * T0.alloc_stride[0LL];
309-
float T1[15LL];
309+
Array<float, 15LL, 1> T1;
310310
#pragma unroll 4
311311
for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
312312
nvfuser_index_t i2;
@@ -328,7 +328,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
328328
}
329329
}
330330
NVFUSER_UPDATE_MAGIC_ZERO;
331-
float T2[3LL];
331+
Array<float, 3LL, 1> T2;
332332
#pragma unroll
333333
for(nvfuser_index_t i6 = 0LL; i6 < 3LL; ++i6) {
334334
T2[i6]
@@ -362,7 +362,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
362362
}
363363
}
364364
NVFUSER_UPDATE_MAGIC_ZERO;
365-
float T3[3LL];
365+
Array<float, 3LL, 1> T3;
366366
#pragma unroll
367367
for(nvfuser_index_t i14 = 0LL; i14 < 3LL; ++i14) {
368368
T3[i14]
@@ -421,7 +421,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
421421
i1 = 5LL * T0.alloc_stride[0LL];
422422
bool b2;
423423
b2 = 4LL < T0.logical_size[0LL];
424-
float T1[15LL];
424+
Array<float, 15LL, 1> T1;
425425
#pragma unroll
426426
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
427427
T1[i3] = 0LL;
@@ -454,7 +454,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
454454
}
455455
}
456456
NVFUSER_UPDATE_MAGIC_ZERO;
457-
float T2[3LL];
457+
Array<float, 3LL, 1> T2;
458458
#pragma unroll
459459
for(nvfuser_index_t i3 = 0LL; i3 < 3LL; ++i3) {
460460
T1[(12LL + i3)] = 0LL;
@@ -486,7 +486,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
486486
i13 = 3LL * ((1LL + i9) % 5LL);
487487
bool b14;
488488
b14 = (5LL + i9) < T0.logical_size[0LL];
489-
float T3[3LL];
489+
Array<float, 3LL, 1> T3;
490490
#pragma unroll
491491
for(nvfuser_index_t i15 = 0LL; i15 < 3LL; ++i15) {
492492
T3[i15]
@@ -599,7 +599,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
599599
}
600600
NVFUSER_UPDATE_MAGIC_ZERO;
601601
asm volatile("cp.async.wait_group %0;\n"::"n"(3LL));
602-
float T1[2LL];
602+
Array<float, 2LL, 1> T1;
603603
T1[0LL]
604604
= T4[0LL];
605605
#pragma unroll 4
@@ -637,14 +637,14 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
637637
for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
638638
T1[((1LL + i14) % 2LL)]
639639
= T4[(i11 + i14)];
640-
float T2[1LL];
640+
Array<float, 1LL, 1> T2;
641641
T2[0LL]
642642
= T1[i14];
643643
T3[(i12 + (i14 + nvfuser_zero))]
644644
= T2[0LL];
645645
}
646646
NVFUSER_UPDATE_MAGIC_ZERO;
647-
float T2[1LL];
647+
Array<float, 1LL, 1> T2;
648648
T2[0LL]
649649
= T1[0LL];
650650
T3[(2LL + i12)]

tests/cpp/test_scalar_hoisting.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
316316
b7 = i0 < i6;
317317
float f8;
318318
f8 = (float)(i6);
319-
float T1[1LL];
319+
Array<float, 1LL, 1> T1;
320320
if (b7) {
321321
T1[0LL]
322322
= sinf(T0[i0]);

Comments (0)