@@ -1310,42 +1310,42 @@ function triton_optimization_passes()
1310
1310
" convert-nvvm-to-llvm" ,
1311
1311
# common passes
1312
1312
" canonicalize" ,
1313
- # # ttir passes
1314
- # "triton-combine",
1315
- # "triton-reorder-broadcast",
1316
- # "triton-rewrite-tensor-pointer",
1317
- # "triton-rewrite-tensor-descriptor-to-pointer",
1318
- # "triton-loop-unroll",
1319
- # "triton-licm",
1320
- # "triton-loop-aware-cse",
1321
- # # TODO : should num-warps and num-ctas be set for each kernel?
1322
- # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
1323
- # # ttgir passes
1324
- # "tritongpu-coalesce",
1325
- # "tritongpu-optimize-thread-locality",
1326
- # "tritongpu-hoist-tmem-alloc",
1327
- # "tritongpu-assign-latencies",
1328
- # "tritongpu-pipeline",
1329
- # "tritongpu-schedule-loops",
1330
- # "tritongpu-automatic-warp-specialization",
1331
- # "tritongpu-prefetch",
1332
- # "tritongpu-accelerate-matmul",
1333
- # "tritongpu-reorder-instructions",
1334
- # "tritongpu-F32DotTC",
1335
- # "tritongpu-optimize-dot-operands",
1336
- # "tritongpu-remove-layout-conversions",
1337
- # "tritongpu-reduce-data-duplication",
1338
- # "tritongpu-hoist-tmem-alloc",
1339
- # "tritongpu-fuse-nested-loops",
1340
- # "tritongpu-rewrite-partition-dependencies",
1341
- # "tritongpu-partition-loops",
1342
- # "tritongpu-combine-tensor-select-and-if",
1343
- # # ttgir to llvm passes
1344
- # "tritongpu-allocate-warp-groups",
1345
- # "allocate-shared-memory",
1346
- # "tritongpu-global-scratch-memory-allocation",
1347
- # "tritongpu-optimize-accumulator-init",
1348
- # "tritongpu-coalesce-async-copy",
1313
+ # ttir passes
1314
+ " triton-combine" ,
1315
+ " triton-reorder-broadcast" ,
1316
+ " triton-rewrite-tensor-pointer" ,
1317
+ " triton-rewrite-tensor-descriptor-to-pointer" ,
1318
+ " triton-loop-unroll" ,
1319
+ " triton-licm" ,
1320
+ " triton-loop-aware-cse" ,
1321
+ # TODO : should num-warps and num-ctas be set for each kernel?
1322
+ " convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4 : end ]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}" ,
1323
+ # ttgir passes
1324
+ " tritongpu-coalesce" ,
1325
+ " tritongpu-optimize-thread-locality" ,
1326
+ " tritongpu-hoist-tmem-alloc" ,
1327
+ " tritongpu-assign-latencies" ,
1328
+ " tritongpu-pipeline" ,
1329
+ " tritongpu-schedule-loops" ,
1330
+ " tritongpu-automatic-warp-specialization" ,
1331
+ " tritongpu-prefetch" ,
1332
+ " tritongpu-accelerate-matmul" ,
1333
+ " tritongpu-reorder-instructions" ,
1334
+ " tritongpu-F32DotTC" ,
1335
+ " tritongpu-optimize-dot-operands" ,
1336
+ " tritongpu-remove-layout-conversions" ,
1337
+ " tritongpu-reduce-data-duplication" ,
1338
+ " tritongpu-hoist-tmem-alloc" ,
1339
+ " tritongpu-fuse-nested-loops" ,
1340
+ " tritongpu-rewrite-partition-dependencies" ,
1341
+ " tritongpu-partition-loops" ,
1342
+ " tritongpu-combine-tensor-select-and-if" ,
1343
+ # ttgir to llvm passes
1344
+ " tritongpu-allocate-warp-groups" ,
1345
+ " allocate-shared-memory" ,
1346
+ " tritongpu-global-scratch-memory-allocation" ,
1347
+ " tritongpu-optimize-accumulator-init" ,
1348
+ " tritongpu-coalesce-async-copy" ,
1349
1349
],
1350
1350
" ," ,
1351
1351
)
@@ -2303,8 +2303,7 @@ function compile_mlir!(
2303
2303
end
2304
2304
end
2305
2305
2306
- # XXX : re-enable this pass
2307
- # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
2306
+ run_pass_pipeline! (mod, " mark-func-memory-effects" , " mark-func-memory-effects" )
2308
2307
2309
2308
func_op = MLIR. API. mlirSymbolTableLookup (
2310
2309
MLIR. IR. SymbolTable (MLIR. IR. Operation (mod)), fnname
0 commit comments