feat: put the tt.func in a separate named nested module and call it via a nested symbol ref

avik-pal · avik-pal · commit 95598f9c1724 · 2025-09-28T00:14:35.000-05:00
diff --git a/deps/ReactantExtra/WORKSPACE b/deps/ReactantExtra/WORKSPACE
@@ -4,7 +4,7 @@ NSYNC_COMMIT = "82b118aa7ace3132e517e2c467f8732978cf4023"
 
 NSYNC_SHA256 = ""
 
-ENZYMEXLA_COMMIT = "b59185c7586783a17d9486e682307ae89c713964"
+ENZYMEXLA_COMMIT = "52ae936cae8f7050adc26c4ed5e755200497dc86"
 
 ENZYMEXLA_SHA256 = ""
 
diff --git a/src/Compiler.jl b/src/Compiler.jl
@@ -1310,42 +1310,42 @@ function triton_optimization_passes()
             "convert-nvvm-to-llvm",
             # common passes
             "canonicalize",
-            # # ttir passes
-            # "triton-combine",
-            # "triton-reorder-broadcast",
-            # "triton-rewrite-tensor-pointer",
-            # "triton-rewrite-tensor-descriptor-to-pointer",
-            # "triton-loop-unroll",
-            # "triton-licm",
-            # "triton-loop-aware-cse",
-            # # TODO: should num-warps and num-ctas be set for each kernel?
-            # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
-            # # ttgir passes
-            # "tritongpu-coalesce",
-            # "tritongpu-optimize-thread-locality",
-            # "tritongpu-hoist-tmem-alloc",
-            # "tritongpu-assign-latencies",
-            # "tritongpu-pipeline",
-            # "tritongpu-schedule-loops",
-            # "tritongpu-automatic-warp-specialization",
-            # "tritongpu-prefetch",
-            # "tritongpu-accelerate-matmul",
-            # "tritongpu-reorder-instructions",
-            # "tritongpu-F32DotTC",
-            # "tritongpu-optimize-dot-operands",
-            # "tritongpu-remove-layout-conversions",
-            # "tritongpu-reduce-data-duplication",
-            # "tritongpu-hoist-tmem-alloc",
-            # "tritongpu-fuse-nested-loops",
-            # "tritongpu-rewrite-partition-dependencies",
-            # "tritongpu-partition-loops",
-            # "tritongpu-combine-tensor-select-and-if",
-            # # ttgir to llvm passes
-            # "tritongpu-allocate-warp-groups",
-            # "allocate-shared-memory",
-            # "tritongpu-global-scratch-memory-allocation",
-            # "tritongpu-optimize-accumulator-init",
-            # "tritongpu-coalesce-async-copy",
+            # ttir passes
+            "triton-combine",
+            "triton-reorder-broadcast",
+            "triton-rewrite-tensor-pointer",
+            "triton-rewrite-tensor-descriptor-to-pointer",
+            "triton-loop-unroll",
+            "triton-licm",
+            "triton-loop-aware-cse",
+            # TODO: should num-warps and num-ctas be set for each kernel?
+            "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
+            # ttgir passes
+            "tritongpu-coalesce",
+            "tritongpu-optimize-thread-locality",
+            "tritongpu-hoist-tmem-alloc",
+            "tritongpu-assign-latencies",
+            "tritongpu-pipeline",
+            "tritongpu-schedule-loops",
+            "tritongpu-automatic-warp-specialization",
+            "tritongpu-prefetch",
+            "tritongpu-accelerate-matmul",
+            "tritongpu-reorder-instructions",
+            "tritongpu-F32DotTC",
+            "tritongpu-optimize-dot-operands",
+            "tritongpu-remove-layout-conversions",
+            "tritongpu-reduce-data-duplication",
+            "tritongpu-hoist-tmem-alloc",
+            "tritongpu-fuse-nested-loops",
+            "tritongpu-rewrite-partition-dependencies",
+            "tritongpu-partition-loops",
+            "tritongpu-combine-tensor-select-and-if",
+            # ttgir to llvm passes
+            "tritongpu-allocate-warp-groups",
+            "allocate-shared-memory",
+            "tritongpu-global-scratch-memory-allocation",
+            "tritongpu-optimize-accumulator-init",
+            "tritongpu-coalesce-async-copy",
         ],
         ",",
     )
@@ -2303,8 +2303,7 @@ function compile_mlir!(
         end
     end
 
-    # XXX: re-enable this pass
-    # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
+    run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
 
     func_op = MLIR.API.mlirSymbolTableLookup(
         MLIR.IR.SymbolTable(MLIR.IR.Operation(mod)), fnname
diff --git a/src/Ops.jl b/src/Ops.jl
@@ -1707,12 +1707,19 @@ function _extract_function(
     nested_module::Bool=false,
 )
     module_suffix = string(hash(code); base=16)
-    name_to_call = _new_function_name(func_name, module_suffix)
+    name_to_call = func_name * "_call_" * module_suffix
+    mod_name = func_name * "_module_" * module_suffix
 
     current_module = MLIR.IR.mmodule()
     if nested_module
         new_module = MLIR.IR.Module()
-        push!(MLIR.IR.body(current_module), MLIR.IR.Operation(new_module, true))
+        moduleop = MLIR.IR.Operation(new_module, true)
+        MLIR.IR.attr!(
+            moduleop,
+            String(MLIR.API.mlirSymbolTableGetSymbolAttributeName()),
+            MLIR.IR.Attribute(mod_name),
+        )
+        push!(MLIR.IR.body(current_module), moduleop)
         current_module = new_module
     end
     top_level_block = MLIR.IR.body(current_module)
@@ -1764,7 +1771,7 @@ function _extract_function(
         error("hlo_call: could not find function $func_name in the provided module")
     end
 
-    return fn, name_to_call
+    return fn, name_to_call, mod_name
 end
 
 function triton_call(
@@ -1778,7 +1785,7 @@ function triton_call(
     location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
     # TODO: other kwargs
 )
-    _, name_to_call = _extract_function(
+    _, name_to_call, mod_name = _extract_function(
         mlir_code; func_name, func_op_kind="tt.func", nested_module=true
     )
 
@@ -1788,7 +1795,9 @@ function triton_call(
         grid_z.mlir_data,
         shmem.mlir_data,
         [Reactant.TracedUtils.get_mlir_data(a) for a in args];
-        fn=MLIR.IR.FlatSymbolRefAttribute(name_to_call),
+        fn=MLIR.IR.SymbolRefAttribute(
+            mod_name, MLIR.IR.Attribute[MLIR.IR.FlatSymbolRefAttribute(name_to_call)]
+        ),
         result_0=MLIR.IR.Type[],
         location,
     )
@@ -1826,7 +1835,7 @@ julia> Reactant.@jit(
     func_name="main",
     location=mlir_stacktrace("hlo_call", @__FILE__, @__LINE__),
 )
-    fn, name_to_call = _extract_function(code; func_name, func_op_kind="func.func")
+    fn, name_to_call, _ = _extract_function(code; func_name, func_op_kind="func.func")
 
     ftype_attr = MLIR.IR.attr(fn, "function_type")
     ftype = MLIR.IR.Type(ftype_attr)